diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm50.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm50.h new file mode 100644 index 0000000000000000000000000000000000000000..1701158b0bdd479cb179e4d0162c78ab335aba8a --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm50.h @@ -0,0 +1,432 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Matrix multiply +*/ + +#pragma once + +#include "cutlass/arch/mma.h" +#include "cutlass/complex.h" +#include "cutlass/quaternion.h" +#include "cutlass/functional.h" + +#include "cutlass/layout/matrix.h" +#include "cutlass/gemm/gemm.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace arch { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation +template < + /// Layout of A matrix + typename LayoutA, + /// Layout of B matrix + typename LayoutB, + /// Layout of C matrix + typename LayoutC +> +struct Mma, 1, float, LayoutA, float, LayoutB, float, LayoutC, OpMultiplyAdd> { + + using Shape = gemm::GemmShape<1, 1, 1>; + using Operator = OpMultiplyAdd; + using ElementC = float; + + CUTLASS_HOST_DEVICE + void operator()( + Array &d, + Array const &a, + Array const &b, + Array const &c + ) { + d[0] = a[0] * b[0] + c[0]; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation +template < + /// Layout of A matrix + typename LayoutA, + /// Layout of B matrix + typename LayoutB, + /// Layout of C matrix + typename LayoutC +> +struct Mma, 1, double, LayoutA, double, 
LayoutB, double, LayoutC, OpMultiplyAdd> { + + using Shape = gemm::GemmShape<1, 1, 1>; + using Operator = OpMultiplyAdd; + using ElementC = double; + + CUTLASS_HOST_DEVICE + void operator()( + Array &d, + Array const &a, + Array const &b, + Array const &c + ) { + + d[0] = a[0] * b[0] + c[0]; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation +template < + /// Layout of A matrix + typename LayoutA, + /// Layout of B matrix + typename LayoutB, + /// Layout of C matrix + typename LayoutC +> +struct Mma, 1, int, LayoutA, int, LayoutB, int, LayoutC, OpMultiplyAdd> { + + using Shape = gemm::GemmShape<1, 1, 1>; + using Operator = OpMultiplyAdd; + using ElementC = int; + + CUTLASS_HOST_DEVICE + void operator()( + Array &d, + Array const &a, + Array const &b, + Array const &c + ) { + + d[0] = a[0] * b[0] + c[0]; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation +template < + /// Layout of A matrix + typename LayoutA, + /// Layout of B matrix + typename LayoutB, + /// Layout of C matrix + typename LayoutC +> +struct Mma< + gemm::GemmShape<1, 1, 1>, + 1, + complex, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<1, 1, 1>; + using Operator = OpMultiplyAddComplex; + using ElementC = complex; + + CUTLASS_HOST_DEVICE + void operator()( + Array, 1> &d, + Array, 1> const &a, + Array, 1> const &b, + Array, 1> const &c + ) { + + d[0].real() = a[0].real() * b[0].real() + c[0].real(); + d[0].imag() = a[0].imag() * b[0].real() + c[0].imag(); + d[0].real() = -a[0].imag() * b[0].imag() + d[0].real(); + d[0].imag() = a[0].real() * b[0].imag() + d[0].imag(); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation +template < + /// Layout of A matrix + 
typename LayoutA, + /// Layout of B matrix + typename LayoutB, + /// Layout of C matrix + typename LayoutC +> +struct Mma< + gemm::GemmShape<1, 1, 1>, + 1, + complex, + LayoutA, + float, + LayoutB, + complex, + LayoutC, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<1, 1, 1>; + using Operator = OpMultiplyAddComplex; + using ElementC = complex; + + CUTLASS_HOST_DEVICE + void operator()( + Array, 1> &d, + Array, 1> const &a, + Array const &b, + Array, 1> const &c + ) { + + d[0].real() = a[0].real() * b[0] + c[0].real(); + d[0].imag() = a[0].imag() * b[0] + c[0].imag(); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation +template < + /// Layout of A matrix + typename LayoutA, + /// Layout of B matrix + typename LayoutB, + /// Layout of C matrix + typename LayoutC +> +struct Mma< + gemm::GemmShape<1, 1, 1>, + 1, + float, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<1, 1, 1>; + using Operator = OpMultiplyAddComplex; + using ElementC = complex; + + CUTLASS_HOST_DEVICE + void operator()( + Array, 1> &d, + Array const &a, + Array, 1> const &b, + Array, 1> const &c + ) { + + d[0].real() = a[0] * b[0].real() + c[0].real(); + d[0].imag() = a[0] * b[0].imag() + d[0].imag(); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation +template < + /// Layout of A matrix + typename LayoutA, + /// Layout of B matrix + typename LayoutB, + /// Layout of C matrix + typename LayoutC +> +struct Mma< + gemm::GemmShape<1, 1, 1>, + 1, + complex, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<1, 1, 1>; + using Operator = OpMultiplyAddComplex; + using ElementC = complex; + + CUTLASS_HOST_DEVICE + void operator()( + Array, 1> &d, + Array, 1> const &a, + Array, 1> const &b, + Array, 1> 
const &c + ) { + + d[0].real() = a[0].real() * b[0].real() + c[0].real(); + d[0].imag() = a[0].imag() * b[0].real() + c[0].imag(); + d[0].real() = -a[0].imag() * b[0].imag() + d[0].real(); + d[0].imag() = a[0].real() * b[0].imag() + d[0].imag(); + } +}; + +/// Matrix multiply-add operation +template < + /// Layout of A matrix + typename LayoutA, + /// Layout of B matrix + typename LayoutB, + /// Layout of C matrix + typename LayoutC +> +struct Mma< + gemm::GemmShape<1, 1, 1>, + 1, + complex, + LayoutA, + double, + LayoutB, + complex, + LayoutC, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<1, 1, 1>; + using Operator = OpMultiplyAddComplex; + using ElementC = complex; + + CUTLASS_HOST_DEVICE + void operator()( + Array, 1> &d, + Array, 1> const &a, + Array const &b, + Array, 1> const &c + ) { + + d[0].real() = a[0].real() * b[0] + c[0].real(); + d[0].imag() = a[0].imag() * b[0] + c[0].imag(); + } +}; + +/// Matrix multiply-add operation +template < + /// Layout of A matrix + typename LayoutA, + /// Layout of B matrix + typename LayoutB, + /// Layout of C matrix + typename LayoutC +> +struct Mma< + gemm::GemmShape<1, 1, 1>, + 1, + double, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<1, 1, 1>; + using Operator = OpMultiplyAddComplex; + using ElementC = complex; + + CUTLASS_HOST_DEVICE + void operator()( + Array, 1> &d, + Array const &a, + Array, 1> const &b, + Array, 1> const &c + ) { + + d[0].real() = a[0] * b[0].real() + c[0].real(); + d[0].imag() = a[0] * b[0].imag() + d[0].imag(); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation +template < + /// Layout of A matrix + typename LayoutA, + /// Layout of B matrix + typename LayoutB, + /// Layout of C matrix + typename LayoutC +> +struct Mma, 1, half_t, LayoutA, half_t, LayoutB, float, LayoutC, OpMultiplyAdd> { + + using Shape = gemm::GemmShape<1, 1, 1>; + 
using Operator = OpMultiplyAdd; + using ElementC = float; + + CUTLASS_HOST_DEVICE + void operator()( + Array &d, + Array const &a, + Array const &b, + Array const &c + ) { + d[0] = float(a[0]) * float(b[0]) + c[0]; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation for Quaternions +template < + /// Layout of A matrix + typename LayoutA, + /// Layout of B matrix + typename LayoutB, + /// Layout of C matrix + typename LayoutC +> +struct Mma, 1, Quaternion, LayoutA, Quaternion, LayoutB, Quaternion, LayoutC, OpMultiplyAdd> { + + using Shape = gemm::GemmShape<1, 1, 1>; + using Operator = OpMultiplyAdd; + using Element = Quaternion; + using ElementC = Element; + + CUTLASS_HOST_DEVICE + void operator()( + Array &d, + Array const &a, + Array const &b, + Array const &c + ) { + multiply_add op; + d[0] = op(a[0], b[0], c[0]); + } + +}; + +} +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm60.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm60.h new file mode 100644 index 0000000000000000000000000000000000000000..31ef2b653076863cfb9387ba078d31ee8b52d607 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm60.h @@ -0,0 +1,252 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Matrix multiply +*/ + +#pragma once + +#include + +#include "cutlass/arch/mma.h" + +#include "cutlass/layout/matrix.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace arch { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation +template +struct Mma< + gemm::GemmShape<2,1,1>, + 1, + half_t, + LayoutA, + half_t, + LayoutB, + half_t, + LayoutC, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<2, 1, 1>; + using Operator = OpMultiplyAdd; + using ElementC = half_t; + + CUTLASS_HOST_DEVICE + void operator()( + Array &d, + Array const &a, + Array const &b, + Array const &c + ) { + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)) + + __half2 const & A = reinterpret_cast<__half2 const &>(a); + __half2 B = __half2half2(reinterpret_cast<__half const &>(b)); + __half2 const & C = reinterpret_cast<__half2 const &>(c); + + __half2 D = __hfma2(A, B, C); + + d = reinterpret_cast &>(D); + +#else + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < 2; ++i) { + d[i] = a[i] * b[0] + c[i]; + } +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation +template +struct Mma< + gemm::GemmShape<1,2,1>, + 1, + half_t, + LayoutA, + half_t, + LayoutB, + half_t, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<1, 2, 1>; + using Operator = OpMultiplyAdd; + using ElementC = half_t; + + CUTLASS_HOST_DEVICE + void operator()( + Array &d, + Array const &a, + Array const &b, + Array const &c + ) { + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)) + + __half2 const & A = __half2half2(reinterpret_cast<__half const &>(a)); + __half2 B = reinterpret_cast<__half2 const &>(b); + __half2 const & C = reinterpret_cast<__half2 const &>(c); + + __half2 D = __hfma2(A, B, C); + + d = reinterpret_cast 
&>(D); + +#else + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < 2; ++i) { + d[i] = a[0] * b[i] + c[i]; + } +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation +template <> +struct Mma < + gemm::GemmShape<2, 2, 1>, + 1, + half_t, + layout::ColumnMajor, + half_t, + layout::RowMajor, + half_t, + layout::ColumnMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<2, 2, 1>; + using Operator = OpMultiplyAdd; + using ElementC = half_t; + + CUTLASS_HOST_DEVICE + void operator()( + Array &d, + Array const &a, + Array const &b, + Array const &c + ) { + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)) + + __half2 const & A = reinterpret_cast<__half2 const &>(a); + __half2 Blo = __low2half2(reinterpret_cast<__half2 const &>(b)); + __half2 Bhi = __high2half2(reinterpret_cast<__half2 const &>(b)); + + __half2 const *C = reinterpret_cast<__half2 const *>(&c); + + __half2 Dlo = __hfma2(A, Blo, C[0]); + __half2 Dhi = __hfma2(A, Bhi, C[1]); + + Array * D = reinterpret_cast *>(&d); + + D[0] = reinterpret_cast const &>(Dlo); + D[1] = reinterpret_cast const &>(Dhi); + +#else + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < 2; ++j) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < 2; ++i) { + d[i + 2 * j] = a[i] * b[j] + c[i + 2 * j]; + } + } +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation +template <> +struct Mma< + gemm::GemmShape<2, 2, 1>, + 1, + half_t, + layout::ColumnMajor, + half_t, + layout::RowMajor, + half_t, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<2, 2, 1>; + using Operator = OpMultiplyAdd; + using ElementC = half_t; + + CUTLASS_HOST_DEVICE + void operator()( + Array &d, + Array const &a, + Array const &b, + Array const &c + ) { + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)) + + __half2 Alo = 
__low2half2(reinterpret_cast<__half2 const &>(a)); + __half2 Ahi = __high2half2(reinterpret_cast<__half2 const &>(a)); + __half2 const & B = reinterpret_cast<__half2 const &>(b); + + __half2 const *C = reinterpret_cast<__half2 const *>(&c); + + __half2 Dlo = __hfma2(Alo, B, C[0]); + __half2 Dhi = __hfma2(Ahi, B, C[1]); + + Array * D = reinterpret_cast *>(&d); + + D[0] = reinterpret_cast &>(Dlo); + D[1] = reinterpret_cast &>(Dhi); +#else + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < 2; ++i) { + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < 2; ++j) { + d[i * 2 + j] = a[i] * b[j] + c[i * 2 + j]; + } + } +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} +} diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm61.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm61.h new file mode 100644 index 0000000000000000000000000000000000000000..b780335efadeecee07f7c1c98422f18fec6f7ea3 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm61.h @@ -0,0 +1,142 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Matrix multiply +*/ + +#pragma once + +#include "cutlass/layout/matrix.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace arch { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation +template +struct Mma< + gemm::GemmShape<1,1,4>, + 1, + int8_t, + LayoutA, + int8_t, + LayoutB, + int, + LayoutC, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<1, 1, 4>; + using Operator = OpMultiplyAdd; + using ElementC = int; + + CUTLASS_HOST_DEVICE + void operator()( + Array &d, + Array const &a, + Array const &b, + Array const &c + ) { + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610)) + + unsigned const &A = reinterpret_cast(a); + unsigned const &B = reinterpret_cast(b); + + asm volatile("dp4a.s32.s32 %0, %1, %2, %3;" + : "=r"(d[0]) + : "r"(A), "r"(B), "r"(c[0])); + +#else + + d[0] = c[0]; + + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < 4; ++k) { + d[0] += a[k] * b[k]; + } + +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation +template +struct Mma< + gemm::GemmShape<1, 1, 2>, + 1, + int16_t, + layout::RowMajor, + int16_t, + layout::ColumnMajor, + int, + LayoutC, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<1, 1, 2>; + using Operator = OpMultiplyAdd; + using ElementC = int; + + CUTLASS_HOST_DEVICE + void operator()( + Array &d, + Array const &a, + Array const &b, + Array const &c + ) { + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 610)) + + unsigned const &A = reinterpret_cast(a); + unsigned const &B = reinterpret_cast(b); + + asm volatile("dp2a.s32.s32 %0, %1, %2, %3;" + : "=r"(d[0]) + : "r"(A), "r"(B), "r"(c[0])); +#else + d[0] = c[0]; + + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < 2; ++k) { + d[0] += a[k] * b[k]; + } +#endif + } +}; + 
+///////////////////////////////////////////////////////////////////////////////////////////////// + +} +} diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm70.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm70.h new file mode 100644 index 0000000000000000000000000000000000000000..6acdcfac3b9d3d10253d3a343a1d097b617ddb16 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm70.h @@ -0,0 +1,661 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Matrix multiply +*/ +#pragma once +#include "cutlass/cutlass.h" +#include CUDA_STD_HEADER(cassert) + +#include "mma.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_types.h" + +#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1)) +#define CUTLASS_ARCH_MMA_SM70_SUPPORTED +#endif + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)) + +#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 &&__CUDACC_VER_MINOR__ >= 1)) +#define CUTLASS_ARCH_MMA_SM70_ENABLED +#endif + +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace arch { + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Matrix multiply accumulate 884 - FP16 accumulation +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F16 = F16 * F16 + F16 +template <> +struct Mma< + gemm::GemmShape<8,8,4>, + 8, + half_t, + layout::ColumnMajor, + half_t, + layout::ColumnMajor, + half_t, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<8, 8, 4>; + + using ElementA = half_t; + using LayoutA = layout::ColumnMajor; + using FragmentA = Array; + + 
using ElementB = half_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = half_t; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm70; + + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) { + +#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED) + + unsigned const *A = reinterpret_cast(&a); + unsigned const *B = reinterpret_cast(&b); + unsigned const *C = reinterpret_cast(&c); + unsigned *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m8n8k4.col.col.f16.f16.f16.f16 {%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]) + ); + +#else + assert(0); + #if defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); + #endif +#endif + } +}; + +/// Matrix multiply-add operation: F16 = F16 * F16 + F16 +template <> +struct Mma< + gemm::GemmShape<8, 8, 4>, + 8, + half_t, + layout::ColumnMajor, + half_t, + layout::RowMajor, + half_t, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<8, 8, 4>; + + using ElementA = half_t; + using LayoutA = layout::ColumnMajor; + using FragmentA = Array; + + using ElementB = half_t; + using LayoutB = layout::RowMajor; + using FragmentB = Array; + + using ElementC = half_t; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm70; + + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) { + +#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED) + + unsigned const *A = reinterpret_cast(&a); + unsigned const *B = reinterpret_cast(&b); + unsigned const *C = reinterpret_cast(&c); + unsigned *D = reinterpret_cast(&d); + + asm 
volatile("mma.sync.aligned.m8n8k4.col.row.f16.f16.f16.f16 {%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]) + ); + +#else + assert(0); + #if defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); + #endif +#endif + } +}; + +/// Matrix multiply-add operation: F16 = F16 * F16 + F16 +template <> +struct Mma< + gemm::GemmShape<8, 8, 4>, + 8, + half_t, + layout::RowMajor, + half_t, + layout::ColumnMajor, + half_t, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<8, 8, 4>; + + using ElementA = half_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = half_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = half_t; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm70; + + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) { + +#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED) + + unsigned const *A = reinterpret_cast(&a); + unsigned const *B = reinterpret_cast(&b); + unsigned const *C = reinterpret_cast(&c); + unsigned *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m8n8k4.row.col.f16.f16.f16.f16 {%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]) + ); + +#else + assert(0); + #if defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); + #endif +#endif + } +}; + +/// Matrix multiply-add operation: F16 = F16 * F16 + F16 +template <> +struct Mma< + gemm::GemmShape<8, 8, 4>, + 8, + half_t, + layout::RowMajor, + half_t, + layout::RowMajor, + half_t, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<8, 8, 4>; + + using ElementA 
= half_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = half_t; + using LayoutB = layout::RowMajor; + using FragmentB = Array; + + using ElementC = half_t; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm70; + + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) { + +#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED) + + unsigned const *A = reinterpret_cast(&a); + unsigned const *B = reinterpret_cast(&b); + unsigned const *C = reinterpret_cast(&c); + unsigned *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m8n8k4.row.row.f16.f16.f16.f16 {%0,%1,%2,%3}, {%4,%5}, {%6,%7}, {%8,%9,%10,%11};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(B[1]), "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]) + ); + +#else + assert(0); + #if defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); + #endif +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Matrix multiply accumulate 884 - FP32 accumulation +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = F16 * F16 + F32 +template <> +struct Mma< + gemm::GemmShape<8, 8, 4>, + 8, + half_t, + layout::ColumnMajor, + half_t, + layout::ColumnMajor, + float, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<8, 8, 4>; + + using ElementA = half_t; + using LayoutA = layout::ColumnMajor; + using FragmentA = Array; + + using ElementB = half_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm70; + + /// Multiply-add + CUTLASS_HOST_DEVICE + void operator()( + 
FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) { + +#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED) + + unsigned const *A = reinterpret_cast(&a); + unsigned const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m8n8k4.col.col.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, " + "{%12,%13,%14,%15,%16,%17,%18,%19};\n" + : "=f"(D[0]), + "=f"(D[1]), + "=f"(D[2]), + "=f"(D[3]), + "=f"(D[4]), + "=f"(D[5]), + "=f"(D[6]), + "=f"(D[7]) + : "r"(A[0]), + "r"(A[1]), + "r"(B[0]), + "r"(B[1]), + "f"(C[0]), + "f"(C[1]), + "f"(C[2]), + "f"(C[3]), + "f"(C[4]), + "f"(C[5]), + "f"(C[6]), + "f"(C[7]) + ); + +#else + assert(0); + #if defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); + #endif +#endif + } +}; + +/// Matrix multiply-add operation: F32 = F16 * F16 + F32 +template <> +struct Mma< + gemm::GemmShape<8, 8, 4>, + 8, + half_t, + layout::ColumnMajor, + half_t, + layout::RowMajor, + float, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<8, 8, 4>; + + using ElementA = half_t; + using LayoutA = layout::ColumnMajor; + using FragmentA = Array; + + using ElementB = half_t; + using LayoutB = layout::RowMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm70; + + /// Multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) { + +#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED) + + unsigned const *A = reinterpret_cast(&a); + unsigned const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m8n8k4.col.row.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, " + "{%12,%13,%14,%15,%16,%17,%18,%19};\n" + : "=f"(D[0]), + 
"=f"(D[1]), + "=f"(D[2]), + "=f"(D[3]), + "=f"(D[4]), + "=f"(D[5]), + "=f"(D[6]), + "=f"(D[7]) + : "r"(A[0]), + "r"(A[1]), + "r"(B[0]), + "r"(B[1]), + "f"(C[0]), + "f"(C[1]), + "f"(C[2]), + "f"(C[3]), + "f"(C[4]), + "f"(C[5]), + "f"(C[6]), + "f"(C[7]) + ); + +#else + assert(0); + #if defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); + #endif +#endif + } +}; + +/// Matrix multiply-add operation: F32 = F16 * F16 + F32 +template <> +struct Mma< + gemm::GemmShape<8, 8, 4>, + 8, + half_t, + layout::RowMajor, + half_t, + layout::ColumnMajor, + float, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<8, 8, 4>; + + using ElementA = half_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = half_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm70; + + /// Multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) { + +#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED) + + unsigned const *A = reinterpret_cast(&a); + unsigned const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, " + "{%12,%13,%14,%15,%16,%17,%18,%19};\n" + : "=f"(D[0]), + "=f"(D[1]), + "=f"(D[2]), + "=f"(D[3]), + "=f"(D[4]), + "=f"(D[5]), + "=f"(D[6]), + "=f"(D[7]) + : "r"(A[0]), + "r"(A[1]), + "r"(B[0]), + "r"(B[1]), + "f"(C[0]), + "f"(C[1]), + "f"(C[2]), + "f"(C[3]), + "f"(C[4]), + "f"(C[5]), + "f"(C[6]), + "f"(C[7]) + ); + +#else + assert(0); + #if defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); + #endif +#endif + } +}; + +/// Matrix multiply-add operation: F32 = F16 * F16 + F32 +template <> +struct Mma< + gemm::GemmShape<8, 8, 4>, + 
8, + half_t, + layout::RowMajor, + half_t, + layout::RowMajor, + float, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<8, 8, 4>; + + using ElementA = half_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = half_t; + using LayoutB = layout::RowMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm70; + + /// Multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) { + +#if defined(CUTLASS_ARCH_MMA_SM70_ENABLED) + + unsigned const *A = reinterpret_cast(&a); + unsigned const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m8n8k4.row.row.f32.f16.f16.f32 {%0,%1,%2,%3,%4,%5,%6,%7}, {%8,%9}, {%10,%11}, " + "{%12,%13,%14,%15,%16,%17,%18,%19};\n" + : "=f"(D[0]), + "=f"(D[1]), + "=f"(D[2]), + "=f"(D[3]), + "=f"(D[4]), + "=f"(D[5]), + "=f"(D[6]), + "=f"(D[7]) + : "r"(A[0]), + "r"(A[1]), + "r"(B[0]), + "r"(B[1]), + "f"(C[0]), + "f"(C[1]), + "f"(C[2]), + "f"(C[3]), + "f"(C[4]), + "f"(C[5]), + "f"(C[6]), + "f"(C[7]) + ); + +#else + assert(0); + #if defined(__CUDA_ARCH__) + asm volatile ("brkpt;\n" ::); + #endif +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation specialized for the entire warp +template < + typename LayoutA, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename Operator +> +struct Mma< + gemm::GemmShape<16, 16, 4>, + 32, + half_t, + LayoutA, + half_t, + LayoutB, + ElementC, + LayoutC, + Operator +> : + public Mma< + gemm::GemmShape<8, 8, 4>, + 8, + half_t, + LayoutA, + half_t, + LayoutB, + ElementC, + LayoutC, + Operator> { + + using Shape = gemm::GemmShape<16, 16, 4>; +}; + 
+///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace arch +} // namespace cutlass diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm75.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm75.h new file mode 100644 index 0000000000000000000000000000000000000000..c71ea076b5c2390cea8b0ba17ae1b642c5d49b48 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm75.h @@ -0,0 +1,789 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Matrix multiply for SM75 +*/ + +#pragma once +#include "cutlass/cutlass.h" +#include CUDA_STD_HEADER(cassert) + +#include "cutlass/arch/wmma.h" + +#if defined(CUTLASS_ARCH_WMMA_ENABLED) +// CUDA Toolkit includes for nvcuda::wmma needed for binarized matrix multiply. +#include +#include "cutlass/wmma_array.h" +#endif + +// CUTLASS includes +#include "cutlass/arch/mma.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_types.h" + +//////////////////////////////////////////////////////////////////////////////// + +#if ((__CUDACC_VER_MAJOR__ > 10) || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2)) + +#define CUTLASS_ARCH_MMA_SM75_SUPPORTED 1 + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)) +#define CUTLASS_ARCH_MMA_SM75_ENABLED +#endif +#endif + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace arch { + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 1688 - FP16 accumulation +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation - F16 = F16 * F16 + F16 +template <> +struct Mma< + gemm::GemmShape<16, 8, 8>, + 32, + half_t, + layout::RowMajor, + half_t, + layout::ColumnMajor, + half_t, + 
layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16, 8, 8>; + + using ElementA = half_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = half_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = half_t; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm75; + + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED) + + unsigned const *A = reinterpret_cast(&a); + unsigned const *B = reinterpret_cast(&b); + unsigned const *C = reinterpret_cast(&c); + unsigned *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 {%0,%1}, {%2,%3}, {%4}, {%5,%6};\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A[0]), "r"(A[1]), "r"(B[0]), "r"(C[0]), "r"(C[1])); + +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + CUTLASS_NOT_IMPLEMENTED(); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 1688 - FP32 accumulation +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = F16 * F16 + F32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 8>, + 32, + half_t, + layout::RowMajor, + half_t, + layout::ColumnMajor, + float, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16, 8, 8>; + + using ElementA = half_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = half_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm75; + + /// Computes multiply-add + 
CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED) + + unsigned const *A = reinterpret_cast(&a); + unsigned const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : + "r"(A[0]), "r"(A[1]), + "r"(B[0]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) + ); + +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + CUTLASS_NOT_IMPLEMENTED(); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Integer matrix multiply (8b) with SATURATE +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: S32 = S8 * S8 + S32 +template <> +struct Mma< + gemm::GemmShape<8, 8, 16>, + 32, + int8_t, + layout::RowMajor, + int8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<8, 8, 16>; + + using ElementA = int8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = int8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm75; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED) + + unsigned const & A = reinterpret_cast(a); + unsigned const & B = reinterpret_cast(b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm 
volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.s8.s8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + CUTLASS_NOT_IMPLEMENTED(); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U8 * S8 + S32 +template <> +struct Mma< + gemm::GemmShape<8, 8, 16>, + 32, + uint8_t, + layout::RowMajor, + int8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<8, 8, 16>; + + using ElementA = uint8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = int8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm75; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED) + + unsigned const & A = reinterpret_cast(a); + unsigned const & B = reinterpret_cast(b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.u8.s8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + CUTLASS_NOT_IMPLEMENTED(); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = S8 * U8 + S32 +template <> +struct Mma< + gemm::GemmShape<8, 8, 16>, + 32, + int8_t, + layout::RowMajor, + uint8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<8, 8, 16>; + + using ElementA = int8_t; + using LayoutA = layout::RowMajor; + using FragmentA = 
Array; + + using ElementB = uint8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm75; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED) + + unsigned const & A = reinterpret_cast(a); + unsigned const & B = reinterpret_cast(b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.s8.u8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + CUTLASS_NOT_IMPLEMENTED(); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U8 * U8 + S32 +template <> +struct Mma< + gemm::GemmShape<8, 8, 16>, + 32, + uint8_t, + layout::RowMajor, + uint8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<8, 8, 16>; + + using ElementA = uint8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = uint8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm75; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED) + + unsigned const & A = reinterpret_cast(a); + unsigned const & B = reinterpret_cast(b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm 
volatile("mma.sync.aligned.m8n8k16.row.col.satfinite.s32.u8.u8.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + CUTLASS_NOT_IMPLEMENTED(); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Integer matrix multiply (4b) - SATURATE +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: S32 = S4 * S4 + S32 +template <> +struct Mma< + gemm::GemmShape<8, 8, 32>, + 32, + int4b_t, + layout::RowMajor, + int4b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<8, 8, 32>; + + using ElementA = int4b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = int4b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm75; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED) + + unsigned const & A = reinterpret_cast(a); + unsigned const & B = reinterpret_cast(b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.s4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + CUTLASS_NOT_IMPLEMENTED(); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U4 * S4 + S32 +template <> +struct Mma< + gemm::GemmShape<8, 8, 32>, + 32, + uint4b_t, + layout::RowMajor, + 
int4b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<8, 8, 32>; + + using ElementA = uint4b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = int4b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm75; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED) + + unsigned const & A = reinterpret_cast(a); + unsigned const & B = reinterpret_cast(b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.u4.s4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + CUTLASS_NOT_IMPLEMENTED(); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = S4 * U4 + S32 +template <> +struct Mma< + gemm::GemmShape<8, 8, 32>, + 32, + int4b_t, + layout::RowMajor, + uint4b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<8, 8, 32>; + + using ElementA = int4b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = uint4b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm75; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if 
defined(CUTLASS_ARCH_MMA_SM75_ENABLED) + + unsigned const & A = reinterpret_cast(a); + unsigned const & B = reinterpret_cast(b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.s4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + CUTLASS_NOT_IMPLEMENTED(); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U4 * U4 + S32 +template <> +struct Mma< + gemm::GemmShape<8, 8, 32>, + 32, + uint4b_t, + layout::RowMajor, + uint4b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<8, 8, 32>; + + using ElementA = uint4b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = uint4b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm75; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED) + + unsigned const & A = reinterpret_cast(a); + unsigned const & B = reinterpret_cast(b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m8n8k32.row.col.satfinite.s32.u4.u4.s32 {%0,%1}, {%2}, {%3}, {%4,%5};\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A), "r"(B), "r"(C[0]), "r"(C[1])); +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + CUTLASS_NOT_IMPLEMENTED(); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// b1 ^ b1 + s32 => s32 +// 
+//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation +template <> +struct Mma< + gemm::GemmShape<8,8,128>, + 32, + uint1b_t, + layout::RowMajor, + uint1b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpXorPopc> { + + using Shape = gemm::GemmShape<8,8,128>; + + using ElementA = uint1b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = uint1b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpXorPopc; + using ArchTag = arch::Sm75; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM75_ENABLED) + +#if defined(CUTLASS_ARCH_WMMA_ENABLED) + using WmmaFragmentA = nvcuda::wmma::fragment< + nvcuda::wmma::matrix_a, + Shape::kM, + Shape::kN, + Shape::kK, + nvcuda::wmma::experimental::precision::b1, + nvcuda::wmma::row_major>; + + using WmmaFragmentB = nvcuda::wmma::fragment< + nvcuda::wmma::matrix_b, + Shape::kM, + Shape::kN, + Shape::kK, + nvcuda::wmma::experimental::precision::b1, + nvcuda::wmma::col_major>; + + using WmmaFragmentC = nvcuda::wmma::fragment< + nvcuda::wmma::accumulator, + Shape::kM, + Shape::kN, + Shape::kK, + int>; + + WmmaFragmentA const & A = reinterpret_cast(a); + WmmaFragmentB const & B = reinterpret_cast(b); + + WmmaFragmentC const & C = reinterpret_cast(c); + WmmaFragmentC & D = reinterpret_cast(d); + + nvcuda::wmma::bmma_sync(D, A, B, C, nvcuda::wmma::experimental::bmmaBitOpXOR, + nvcuda::wmma::experimental::bmmaAccumulateOpPOPC); + +#else + + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + CUTLASS_NOT_IMPLEMENTED(); // WMMA must be supported to issue binary matrix multiply-accumulate instructions. 
+ +#endif // defined(CUTLASS_ARCH_WMMA_ENABLED) + +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace arch +} // namespace cutlass diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm80.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm80.h new file mode 100644 index 0000000000000000000000000000000000000000..22cd87d65b0412e9ac9a4953feee022c5e5feb92 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm80.h @@ -0,0 +1,1500 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Matrix multiply +*/ + +#pragma once +#include "cutlass/cutlass.h" +#include CUDA_STD_HEADER(cassert) + +#include "mma.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_types.h" + +//////////////////////////////////////////////////////////////////////////////// + +#if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0)) + +#define CUTLASS_ARCH_MMA_SM80_SUPPORTED 1 + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) +#define CUTLASS_ARCH_MMA_SM80_ENABLED + +#if (__CUDA_ARCH__ <= 900) +#define CUTLASS_ARCH_MMA_B1_AND_SM80_ENABLED +#endif +#if (__CUDA_ARCH__ <= 890) +#define CUTLASS_ARCH_MMA_B1_XOR_SM80_ENABLED +#endif + +#endif + +#endif + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace arch { + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 1688 - Float BF16, FP32 accumulation +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation - F32 = bf16 * bf16 + F32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 8>, + 32, + bfloat16_t, + layout::RowMajor, + bfloat16_t, + layout::ColumnMajor, + float, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = 
gemm::GemmShape<16, 8, 8>; + + using ElementA = bfloat16_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = bfloat16_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + asm( + "mma.sync.aligned.m16n8k8.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : + "r"(A[0]), "r"(A[1]), + "r"(B[0]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) + ); + +#else + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 1684 - Float TF32 +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = tf32 * tf32 + F32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 4>, + 32, + tfloat32_t, + layout::RowMajor, + tfloat32_t, + layout::ColumnMajor, + float, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16, 8, 4>; + + using ElementA = tfloat32_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = tfloat32_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + CUTLASS_HOST_DEVICE + void 
operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : + "r"(A[0]), "r"(A[1]), + "r"(B[0]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) + ); + +#else + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 1688 - Float TF32 +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = tf32 * tf32 + F32 +template <> +struct Mma, 32, tfloat32_t, layout::RowMajor, + tfloat32_t, layout::ColumnMajor, float, layout::RowMajor, + OpMultiplyAdd> { + using Shape = gemm::GemmShape<16, 8, 8>; + + using ElementA = tfloat32_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = tfloat32_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, 
{%10,%11,%12,%13};\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])); + +#else + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 16816 +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F16 = F16 * F16 + F16 +template <> +struct Mma< + gemm::GemmShape<16, 8, 16>, + 32, + half_t, + layout::RowMajor, + half_t, + layout::ColumnMajor, + half_t, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16, 8, 16>; + + using ElementA = half_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = half_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = half_t; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + uint32_t const *C = reinterpret_cast(&c); + uint32_t *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), + "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]) + ); + +#else + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// + 
+/// Matrix multiply-add operation: F32 = bf16 * bf16 + F32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 16>, + 32, + bfloat16_t, + layout::RowMajor, + bfloat16_t, + layout::ColumnMajor, + float, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16, 8, 16>; + + using ElementA = bfloat16_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = bfloat16_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])); + +#else + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = F16 * F16 + F32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 16>, + 32, + half_t, + layout::RowMajor, + half_t, + layout::ColumnMajor, + float, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16, 8, 16>; + + using ElementA = half_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = half_t; + using LayoutB = layout::ColumnMajor; + using 
FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, " + "{%10,%11,%12,%13};\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])); + +#else + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 884 - F64 +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F64 = F64 * F64 + F64 +template <> +struct Mma< + gemm::GemmShape<8,8,4>, + 32, + double, + layout::RowMajor, + double, + layout::ColumnMajor, + double, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<8,8,4>; + + using ElementA = double; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = double; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = double; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + + using ArchTag = arch::Sm80; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + 
double const & A = reinterpret_cast(a); + double const & B = reinterpret_cast(b); + + double const *C = reinterpret_cast(&c); + double *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m8n8k4.row.col.f64.f64.f64.f64 {%0,%1}, {%2}, {%3}, {%4,%5};\n" + : "=d"(D[0]), "=d"(D[1]) + : "d"(A), "d"(B), "d"(C[0]), "d"(C[1])); + +#else + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 16816 - S8 input, S32 accumulation - SATURATE +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: S32 = S8 * S8 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,16>, + 32, + int8_t, + layout::RowMajor, + int8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<16,8,16>; + + using ElementA = int8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = int8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const &B = reinterpret_cast(b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, " + "{%6}, {%7,%8,%9,%10};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]), + "r"(C[3])); + +#else + 
assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U8 * S8 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,16>, + 32, + uint8_t, + layout::RowMajor, + int8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<16,8,16>; + + using ElementA = uint8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = int8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const &B = reinterpret_cast(b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, " + "{%6}, {%7,%8,%9,%10};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]), + "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = S8 * U8 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,16>, + 32, + int8_t, + layout::RowMajor, + uint8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<16,8,16>; + + using ElementA = int8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = uint8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + 
CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const &B = reinterpret_cast(b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, " + "{%6}, {%7,%8,%9,%10};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]), + "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U8 * U8 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,16>, + 32, + uint8_t, + layout::RowMajor, + uint8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<16,8,16>; + + using ElementA = uint8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = uint8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const &B = reinterpret_cast(b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, " + "{%6}, {%7,%8,%9,%10};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]), + "r"(C[3])); + +#else + assert(0); +#endif + } +}; + 
+////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix Multiply 16832 - S8 input, S32 accumulation - SATURATE
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: S32 = S8 * S8 + S32
+template <>
+struct Mma<gemm::GemmShape<16, 8, 32>, 32, int8_t, layout::RowMajor, int8_t,
+           layout::ColumnMajor, int, layout::RowMajor, OpMultiplyAddSaturate> {
+  using Shape = gemm::GemmShape<16, 8, 32>;
+
+  using ElementA = int8_t;
+  using LayoutA = layout::RowMajor;
+  // 16 x s8 per thread = four 32-bit registers (%4-%7 below).
+  using FragmentA = Array<int8_t, 16>;
+
+  using ElementB = int8_t;
+  using LayoutB = layout::ColumnMajor;
+  // 8 x s8 per thread = two 32-bit registers (%8-%9).
+  using FragmentB = Array<int8_t, 8>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+
+  /// Computes multiply-add with saturating accumulation (satfinite)
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+
+    asm volatile(
+        "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, "
+        "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
+
+#else
+    assert(0);
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: S32 = U8 * S8 + S32
+template <>
+struct Mma<gemm::GemmShape<16, 8, 32>, 32, uint8_t, layout::RowMajor, int8_t,
+           layout::ColumnMajor, int, layout::RowMajor, OpMultiplyAddSaturate> {
+  using Shape = gemm::GemmShape<16, 8, 32>;
+
+  using ElementA = uint8_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<uint8_t, 16>;
+
+  using ElementB = int8_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<int8_t, 8>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+
+  /// Computes multiply-add with saturating accumulation (satfinite)
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+
+    asm volatile(
+        "mma.sync.aligned.m16n8k32.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, "
+        "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
+
+#else
+    assert(0);
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: S32 = S8 * U8 + S32
+template <>
+struct Mma<gemm::GemmShape<16, 8, 32>, 32, int8_t, layout::RowMajor, uint8_t,
+           layout::ColumnMajor, int, layout::RowMajor, OpMultiplyAddSaturate> {
+  using Shape = gemm::GemmShape<16, 8, 32>;
+
+  using ElementA = int8_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<int8_t, 16>;
+
+  using ElementB = uint8_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<uint8_t, 8>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+
+  /// Computes multiply-add with saturating accumulation (satfinite)
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+
+    asm volatile(
+        "mma.sync.aligned.m16n8k32.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, "
+        "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
+
+#else
+    assert(0);
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: S32 = U8 * U8 + S32
+template <>
+struct Mma<gemm::GemmShape<16, 8, 32>, 32, uint8_t, layout::RowMajor, uint8_t,
+           layout::ColumnMajor, int, layout::RowMajor, OpMultiplyAddSaturate> {
+  using Shape = gemm::GemmShape<16, 8, 32>;
+
+  using ElementA = uint8_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<uint8_t, 16>;
+
+  using ElementB = uint8_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<uint8_t, 8>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+
+  /// Computes multiply-add with saturating accumulation (satfinite)
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+
+    asm volatile(
+        "mma.sync.aligned.m16n8k32.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, "
+        "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
+
+#else
+    assert(0);
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix Multiply 16864 - S4 input, S32 accumulation - SATURATE
+//
+////////////////////////////////////////////////////////////////////////////////
+
+/// Matrix multiply-add operation: S32 = S4 * S4 + S32
+template <>
+struct Mma<gemm::GemmShape<16, 8, 64>, 32, cutlass::int4b_t, layout::RowMajor,
+           cutlass::int4b_t, layout::ColumnMajor, int, layout::RowMajor,
+           OpMultiplyAddSaturate> {
+  using Shape = gemm::GemmShape<16, 8, 64>;
+
+  using ElementA = cutlass::int4b_t;
+  using LayoutA = layout::RowMajor;
+  // 32 x s4 per thread = four 32-bit registers (%4-%7 below).
+  using FragmentA = Array<cutlass::int4b_t, 32>;
+
+  using ElementB = cutlass::int4b_t;
+  using LayoutB = layout::ColumnMajor;
+  // 16 x s4 per thread = two 32-bit registers (%8-%9).
+  using FragmentB = Array<cutlass::int4b_t, 16>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+
+  /// Computes multiply-add with saturating accumulation (satfinite)
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+
+    asm volatile(
+        "mma.sync.aligned.m16n8k64.row.col.s32.s4.s4.s32.satfinite {%0,%1,%2,%3}, "
+        "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
+
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: S32 = U4 * S4 + S32
+template <>
+struct Mma<gemm::GemmShape<16, 8, 64>, 32, cutlass::uint4b_t, layout::RowMajor,
+           cutlass::int4b_t, layout::ColumnMajor, int, layout::RowMajor,
+           OpMultiplyAddSaturate> {
+  using Shape = gemm::GemmShape<16, 8, 64>;
+
+  using ElementA = cutlass::uint4b_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<cutlass::uint4b_t, 32>;
+
+  using ElementB = cutlass::int4b_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<cutlass::int4b_t, 16>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+
+  /// Computes multiply-add with saturating accumulation (satfinite)
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+
+    asm volatile(
+        "mma.sync.aligned.m16n8k64.row.col.s32.u4.s4.s32.satfinite {%0,%1,%2,%3}, "
+        "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
+
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: S32 = S4 * U4 + S32
+template <>
+struct Mma<gemm::GemmShape<16, 8, 64>, 32, cutlass::int4b_t, layout::RowMajor,
+           cutlass::uint4b_t, layout::ColumnMajor, int, layout::RowMajor,
+           OpMultiplyAddSaturate> {
+  using Shape = gemm::GemmShape<16, 8, 64>;
+
+  using ElementA = cutlass::int4b_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<cutlass::int4b_t, 32>;
+
+  using ElementB = cutlass::uint4b_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<cutlass::uint4b_t, 16>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+
+  /// Computes multiply-add with saturating accumulation (satfinite)
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+
+    asm volatile(
+        "mma.sync.aligned.m16n8k64.row.col.s32.s4.u4.s32.satfinite {%0,%1,%2,%3}, "
+        "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
+
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+/// Matrix multiply-add operation: S32 = U4 * U4 + S32
+template <>
+struct Mma<gemm::GemmShape<16, 8, 64>, 32, cutlass::uint4b_t, layout::RowMajor,
+           cutlass::uint4b_t, layout::ColumnMajor, int, layout::RowMajor,
+           OpMultiplyAddSaturate> {
+  using Shape = gemm::GemmShape<16, 8, 64>;
+
+  using ElementA = cutlass::uint4b_t;
+  using LayoutA = layout::RowMajor;
+  using FragmentA = Array<cutlass::uint4b_t, 32>;
+
+  using ElementB = cutlass::uint4b_t;
+  using LayoutB = layout::ColumnMajor;
+  using FragmentB = Array<cutlass::uint4b_t, 16>;
+
+  using ElementC = int;
+  using LayoutC = layout::RowMajor;
+  using FragmentC = Array<int, 4>;
+
+  using Operator = OpMultiplyAddSaturate;
+  using ArchTag = arch::Sm80;
+
+  /// Computes multiply-add with saturating accumulation (satfinite)
+  CUTLASS_HOST_DEVICE
+  void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b,
+                  FragmentC const &c) const {
+
+#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED)
+
+    uint32_t const *A = reinterpret_cast<uint32_t const *>(&a);
+    uint32_t const *B = reinterpret_cast<uint32_t const *>(&b);
+
+    int const *C = reinterpret_cast<int const *>(&c);
+    int *D = reinterpret_cast<int *>(&d);
+
+    asm volatile(
+        "mma.sync.aligned.m16n8k64.row.col.s32.u4.u4.s32.satfinite {%0,%1,%2,%3}, "
+        "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n"
+        : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3])
+        : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]),
+          "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]));
+
+#else
+    CUTLASS_UNUSED(a);
+    CUTLASS_UNUSED(b);
+    CUTLASS_UNUSED(c);
+    CUTLASS_UNUSED(d);
+    assert(0);
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Matrix 
Multiply 168256 - B1 input, S32 accumulation - AND,POPC +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: S32 = B1 & B1 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,256>, + 32, + cutlass::uint1b_t, + layout::RowMajor, + cutlass::uint1b_t, + layout::ColumnMajor, + int32_t, + layout::RowMajor, + OpAndPopc> { + + using Shape = gemm::GemmShape<16,8,256>; + + using ElementA = cutlass::uint1b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::uint1b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int32_t; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpAndPopc; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_B1_AND_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.and.popc {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, " + "{%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = B1 & B1 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,256>, + 32, + cutlass::uint1b_t, + layout::RowMajor, + cutlass::uint1b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16,8,256>; + + using ElementA = cutlass::uint1b_t; + using LayoutA = layout::RowMajor; + 
using FragmentA = Array; + + using ElementB = cutlass::uint1b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int32_t; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_B1_AND_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.and.popc {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, " + "{%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 168256 - B1 input, S32 accumulation - XOR,POPC +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: S32 = B1 & B1 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,256>, + 32, + cutlass::uint1b_t, + layout::RowMajor, + cutlass::uint1b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpXorPopc> { + + using Shape = gemm::GemmShape<16,8,256>; + + using ElementA = cutlass::uint1b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::uint1b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpXorPopc; + using ArchTag = 
arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_B1_XOR_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.xor.popc {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, " + "{%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + assert(0); + +#endif // defined(CUTLASS_ARCH_MMA_B1_XOR_SM80_ENABLED) + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace arch +} // namespace cutlass +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm89.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm89.h new file mode 100644 index 0000000000000000000000000000000000000000..4bcd9bc1de9b6e53629e08f478a50d791d198a1a --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm89.h @@ -0,0 +1,641 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief Matrix multiply-accumulate specialzied for SM89 +*/ + +#pragma once +#include "cutlass/cutlass.h" +#include CUDA_STD_HEADER(cassert) + +#include "mma.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_types.h" + +//////////////////////////////////////////////////////////////////////////////// + +#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4) +# define CUTLASS_ARCH_MMA_F32_SM89_SUPPORTED +#endif + +#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 8) +# define CUTLASS_ARCH_MMA_F16_SM89_SUPPORTED +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 890) +# if defined(CUTLASS_ARCH_MMA_F32_SM89_SUPPORTED) +# define CUTLASS_ARCH_MMA_F32_SM89_ENABLED +# endif + +# if defined(CUTLASS_ARCH_MMA_F16_SM89_SUPPORTED) +# define CUTLASS_ARCH_MMA_F16_SM89_ENABLED +# endif +#endif + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace arch { + +//////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +// Whether the Mma uses as SM89 staged accumulation policy +template +static constexpr bool is_sm89_staged_policy_v = + ( + // ElementA must be FP8 + platform::is_same::value || + platform::is_same::value + ) && + ( + // ElementB must be FP8 + platform::is_same::value || + platform::is_same::value + ) && + ( + // The instruction shape must be 16x8x32 + Operator::ArchMmaOperator::Shape::kM == 16 && + Operator::ArchMmaOperator::Shape::kN == 8 && + Operator::ArchMmaOperator::Shape::kK == 32 + ) && + ( + // The operator must be OpMultiplyAdd (default) + platform::is_same::value + ); +} // namespace detail + +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 16832 - Float {E4M3, E5M2}, FP32 accumulation +// 
+//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation - F32 = fe4m3 * fe4m3 + F32 +template +struct Mma< + gemm::GemmShape<16, 8, 32>, + 32, + cutlass::float_e4m3_t, + layout::RowMajor, + cutlass::float_e4m3_t, + layout::ColumnMajor, + float, + layout::RowMajor, + Operator_> { + static_assert(platform::is_same::value || + platform::is_same::value, + "Invalid operator for SM89 FP8 instruction"); + + using Shape = gemm::GemmShape<16, 8, 32>; + + using ElementA = cutlass::float_e4m3_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::float_e4m3_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = Operator_; + using ArchTag = arch::Sm89; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_F32_SM89_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + asm( + "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e4m3.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : + "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), + "r"(B[0]), "r"(B[1]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) + ); + +#else + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + +#endif + } +}; + +/// Matrix multiply-add operation - F32 = fe4m3 * fe5m2 + F32 +template +struct Mma< + gemm::GemmShape<16, 8, 32>, + 32, + cutlass::float_e4m3_t, + layout::RowMajor, + cutlass::float_e5m2_t, + layout::ColumnMajor, + float, + layout::RowMajor, + Operator_> { + static_assert(platform::is_same::value || + 
platform::is_same::value, + "Invalid operator for SM89 FP8 instruction"); + + using Shape = gemm::GemmShape<16, 8, 32>; + + using ElementA = cutlass::float_e4m3_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::float_e5m2_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = Operator_; + using ArchTag = arch::Sm89; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_F32_SM89_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + asm( + "mma.sync.aligned.m16n8k32.row.col.f32.e4m3.e5m2.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : + "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), + "r"(B[0]), "r"(B[1]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) + ); + +#else + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + +#endif + } +}; + +/// Matrix multiply-add operation - F32 = fe5m2 * fe4m3 + F32 +template +struct Mma< + gemm::GemmShape<16, 8, 32>, + 32, + cutlass::float_e5m2_t, + layout::RowMajor, + cutlass::float_e4m3_t, + layout::ColumnMajor, + float, + layout::RowMajor, + Operator_> { + static_assert(platform::is_same::value || + platform::is_same::value, + "Invalid operator for SM89 FP8 instruction"); + + using Shape = gemm::GemmShape<16, 8, 32>; + + using ElementA = cutlass::float_e5m2_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::float_e4m3_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using 
FragmentC = Array; + + using Operator = Operator_; + using ArchTag = arch::Sm89; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_F32_SM89_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + asm( + "mma.sync.aligned.m16n8k32.row.col.f32.e5m2.e4m3.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : + "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), + "r"(B[0]), "r"(B[1]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) + ); + +#else + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + +#endif + } +}; + +/// Matrix multiply-add operation - F32 = fe5m2 * fe5m2 + F32 +template +struct Mma< + gemm::GemmShape<16, 8, 32>, + 32, + cutlass::float_e5m2_t, + layout::RowMajor, + cutlass::float_e5m2_t, + layout::ColumnMajor, + float, + layout::RowMajor, + Operator_> { + static_assert(platform::is_same::value || + platform::is_same::value, + "Invalid operator for SM89 FP8 instruction"); + + using Shape = gemm::GemmShape<16, 8, 32>; + + using ElementA = cutlass::float_e5m2_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::float_e5m2_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = Operator_; + using ArchTag = arch::Sm89; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_F32_SM89_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = 
reinterpret_cast(&d); + + asm( + "mma.sync.aligned.m16n8k32.row.col.f32.e5m2.e5m2.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : + "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), + "r"(B[0]), "r"(B[1]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) + ); + +#else + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 16832 - Float {E4M3, E5M2}, FP16 accumulation +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation - F16 = fe4m3 * fe4m3 + F16 +template +struct Mma< + gemm::GemmShape<16, 8, 32>, + 32, + cutlass::float_e4m3_t, + layout::RowMajor, + cutlass::float_e4m3_t, + layout::ColumnMajor, + cutlass::half_t, + layout::RowMajor, + Operator_> { + static_assert(platform::is_same::value || + platform::is_same::value, + "Invalid operator for SM89 FP8 instruction"); + + using Shape = gemm::GemmShape<16, 8, 32>; + + using ElementA = cutlass::float_e4m3_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::float_e4m3_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = cutlass::half_t; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = Operator_; + using ArchTag = arch::Sm89; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_F16_SM89_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + uint32_t const *C = reinterpret_cast(&c); + uint32_t *D = reinterpret_cast(&d); + + asm( + "mma.sync.aligned.m16n8k32.row.col.f16.e4m3.e4m3.f16 " + "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" + : 
"=r"(D[0]), "=r"(D[1]) + : + "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), + "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]) + ); + +#else + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + +#endif + } +}; + +/// Matrix multiply-add operation - F16 = fe4m3 * fe5m2 + F16 +template +struct Mma< + gemm::GemmShape<16, 8, 32>, + 32, + cutlass::float_e4m3_t, + layout::RowMajor, + cutlass::float_e5m2_t, + layout::ColumnMajor, + cutlass::half_t, + layout::RowMajor, + Operator_> { + static_assert(platform::is_same::value || + platform::is_same::value, + "Invalid operator for SM89 FP8 instruction"); + + using Shape = gemm::GemmShape<16, 8, 32>; + + using ElementA = cutlass::float_e4m3_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::float_e5m2_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = cutlass::half_t; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = Operator_; + using ArchTag = arch::Sm89; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_F16_SM89_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + uint32_t const *C = reinterpret_cast(&c); + uint32_t *D = reinterpret_cast(&d); + + asm( + "mma.sync.aligned.m16n8k32.row.col.f16.e4m3.e5m2.f16 " + "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" + : "=r"(D[0]), "=r"(D[1]) + : + "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), + "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]) + ); + +#else + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + +#endif + } +}; + +/// Matrix multiply-add operation - F16 = fe5m2 * fe4m3 + F16 +template +struct Mma< + gemm::GemmShape<16, 8, 32>, + 32, + cutlass::float_e5m2_t, + layout::RowMajor, + 
cutlass::float_e4m3_t, + layout::ColumnMajor, + cutlass::half_t, + layout::RowMajor, + Operator_> { + static_assert(platform::is_same::value || + platform::is_same::value, + "Invalid operator for SM89 FP8 instruction"); + + using Shape = gemm::GemmShape<16, 8, 32>; + + using ElementA = cutlass::float_e5m2_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::float_e4m3_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = cutlass::half_t; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = Operator_; + using ArchTag = arch::Sm89; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_F16_SM89_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + uint32_t const *C = reinterpret_cast(&c); + uint32_t *D = reinterpret_cast(&d); + + asm( + "mma.sync.aligned.m16n8k32.row.col.f16.e5m2.e4m3.f16 " + "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" + : "=r"(D[0]), "=r"(D[1]) + : + "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), + "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]) + ); + +#else + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + +#endif + } +}; + +/// Matrix multiply-add operation - F16 = fe5m2 * fe5m2 + F16 +template +struct Mma< + gemm::GemmShape<16, 8, 32>, + 32, + cutlass::float_e5m2_t, + layout::RowMajor, + cutlass::float_e5m2_t, + layout::ColumnMajor, + cutlass::half_t, + layout::RowMajor, + Operator_> { + static_assert(platform::is_same::value || + platform::is_same::value, + "Invalid operator for SM89 FP8 instruction"); + + using Shape = gemm::GemmShape<16, 8, 32>; + + using ElementA = cutlass::float_e5m2_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::float_e5m2_t; + using LayoutB = 
layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = cutlass::half_t; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = Operator_; + using ArchTag = arch::Sm89; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_F16_SM89_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + uint32_t const *C = reinterpret_cast(&c); + uint32_t *D = reinterpret_cast(&d); + + asm( + "mma.sync.aligned.m16n8k32.row.col.f16.e5m2.e5m2.f16 " + "{%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" + : "=r"(D[0]), "=r"(D[1]) + : + "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), + "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]) + ); + +#else + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); + +#endif + } +}; + +} // namespace arch +} // namespace cutlass diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm90.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm90.h new file mode 100644 index 0000000000000000000000000000000000000000..b135c8645b48eb40a1cce88c515074e95d4b6a5e --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sm90.h @@ -0,0 +1,241 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Matrix multiply +*/ + +#pragma once +#include "cutlass/cutlass.h" +#include CUDA_STD_HEADER(cassert) + +#include "mma.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/config.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace arch { + +//////////////////////////////////////////////////////////////////////////////// +/// Matrix Multiply-Add 16x8x4 fp64 +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F64 = F64 * F64 + F64 +template <> +struct Mma< + gemm::GemmShape<16,8,4>, + 32, + double, + layout::RowMajor, + double, + layout::ColumnMajor, + double, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16,8,4>; + + using ElementA = double; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = double; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = double; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + + using ArchTag = arch::Sm90; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED) + + double const *A = reinterpret_cast(&a); + double const *B = reinterpret_cast(&b); + + double const *C = reinterpret_cast(&c); + double *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m16n8k4.row.col.f64.f64.f64.f64.rn {%0, %1, %2, %3}, {%4, %5}, {%6}, {%7, %8, %9, %10};\n" + : "=d"(D[0]), "=d"(D[1]), "=d"(D[2]), "=d"(D[3]) + : "d"(A[0]), "d"(A[1]), + "d"(B[0]), + "d"(C[0]), "d"(C[1]), "d"(C[2]), "d"(C[3])); + +#else + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); +#endif + } +}; + 
+//////////////////////////////////////////////////////////////////////////////// +/// Matrix Multiply-Add 16x8x8 fp64 +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F64 = F64 * F64 + F64 +template <> +struct Mma< + gemm::GemmShape<16,8,8>, + 32, + double, + layout::RowMajor, + double, + layout::ColumnMajor, + double, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16,8,8>; + + using ElementA = double; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = double; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = double; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + + using ArchTag = arch::Sm90; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED) + + double const *A = reinterpret_cast(&a); + double const *B = reinterpret_cast(&b); + + double const *C = reinterpret_cast(&c); + double *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m16n8k8.row.col.f64.f64.f64.f64 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n" + : "=d"(D[0]), "=d"(d[1]), "=d"(d[2]), "=d"(d[3]) + : "d"(A[0]), "d"(A[1]), "d"(A[2]), "d"(A[3]), + "d"(B[0]), "d"(B[1]), + "d"(C[0]), "d"(C[1]), "d"(C[2]), "d"(C[3])); + +#else + + CUTLASS_UNUSED(d); + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_NOT_IMPLEMENTED(); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +/// Matrix Multiply-Add 16x8x16 fp64 +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F64 = F64 * F64 + F64 +template <> +struct Mma< + gemm::GemmShape<16,8,16>, + 32, + double, + layout::RowMajor, + double, + 
layout::ColumnMajor, + double, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16,8,16>; + + using ElementA = double; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = double; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = double; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + + using ArchTag = arch::Sm90; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_SM90_F64_MMA_ENABLED) + + double const *A = reinterpret_cast(&a); + double const *B = reinterpret_cast(&b); + + double const *C = reinterpret_cast(&c); + double *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m16n8k16.row.col.f64.f64.f64.f64 {%0, %1, %2, %3}, {%4, %5, %6, %7, %8, %9, %10, %11}, {%12, %13, %14, %15}, {%16, %17, %18, %19};\n" + : "=d"(D[0]), "=d"(D[1]), "=d"(D[2]), "=d"(D[3]) + : "d"(A[0]), "d"(A[2]), "d"(A[2]), "d"(A[3]), "d"(A[4]), "d"(A[5]), "d"(A[6]), "d"(A[7]), + "d"(B[0]), "d"(B[1]), "d"(B[2]), "d"(B[3]), + "d"(C[0]), "d"(C[1]), "d"(C[2]), "d"(C[3])); + +#else + CUTLASS_NOT_IMPLEMENTED(); +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace arch +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sparse_sm80.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sparse_sm80.h new file mode 100644 index 0000000000000000000000000000000000000000..e4ca91a10293334fbd89e21891132442a6216e6a --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sparse_sm80.h @@ -0,0 +1,1234 
@@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief Sparse matrix multiply accumulate for SM80 +*/ + +#pragma once +#include "cutlass/cutlass.h" +#include CUDA_STD_HEADER(cassert) + +#include "mma.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_types.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 1)) + +#define CUTLASS_ARCH_SPARSE_MMA_SM80_SUPPORTED 1 + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) +#define CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED +#endif + +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace arch { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// +// Sparse Matrix Multiply 16832 +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F16 = F16 * F16 + F16 +template <> +struct SparseMma< + gemm::GemmShape<16, 8, 32>, + 32, + half_t, + layout::RowMajor, + half_t, + layout::ColumnMajor, + half_t, + layout::RowMajor, + OpMultiplyAdd, + SPFormatType::Thread +> { + + using Shape = gemm::GemmShape<16, 8, 32>; + + using ElementA = half_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = half_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = half_t; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using FragmentE = uint32_t; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + static int const kSparse = 2; + + static int const kMetaSizeInBits = 2; + + static int const kMaxID2 = 2; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const 
&c, uint32_t const &E, int const id2) const { + +#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + uint32_t const *C = reinterpret_cast(&c); + uint32_t *D = reinterpret_cast(&d); + +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) + if (id2 == 0) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16 {%0,%1}, " + "{%2,%3,%4,%5}, {%6,%7,%8,%9}, {%10,%11}, %12, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(B[2]), "r"(B[3]), "r"(C[0]), "r"(C[1]), "r"(E)); + } + else if (id2 == 1) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16 {%0,%1}, " + "{%2,%3,%4,%5}, {%6,%7,%8,%9}, {%10,%11}, %12, 0x1;\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(B[2]), "r"(B[3]), "r"(C[0]), "r"(C[1]), "r"(E)); + } + else { + assert(0); + } +#else + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16 {%0,%1}, " + "{%2,%3,%4,%5}, {%6,%7,%8,%9}, {%10,%11}, %12, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(B[2]), "r"(B[3]), "r"(C[0]), "r"(C[1]), "r"(E)); + } + else if (id2 == 1) { + asm volatile( + "mma.sp.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16 {%0,%1}, " + "{%2,%3,%4,%5}, {%6,%7,%8,%9}, {%10,%11}, %12, 0x1;\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(B[2]), "r"(B[3]), "r"(C[0]), "r"(C[1]), "r"(E)); + } + else { + assert(0); + } +#endif + +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = F16 * F16 + 
F32 +template <> +struct SparseMma< + gemm::GemmShape<16, 8, 32>, + 32, + half_t, + layout::RowMajor, + half_t, + layout::ColumnMajor, + float, + layout::RowMajor, + OpMultiplyAdd, + SPFormatType::Thread + > { + + using Shape = gemm::GemmShape<16, 8, 32>; + + using ElementA = half_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = half_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using FragmentE = uint32_t; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + static int const kSparse = 2; + + static int const kMetaSizeInBits = 2; + + static int const kMaxID2 = 2; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c, uint32_t const &E, int const id2) const { + +#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) + if (id2 == 0) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(B[2]), "r"(B[3]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), + "r"(E)); + } + else if (id2 == 1) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(B[2]), "r"(B[3]), "f"(C[0]), 
"f"(C[1]), "f"(C[2]), "f"(C[3]), + "r"(E)); + } + else { + assert(0); + } +#else + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(B[2]), "r"(B[3]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), + "r"(E)); + } + else if (id2 == 1) { + asm volatile( + "mma.sp.sync.aligned.m16n8k32.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(B[2]), "r"(B[3]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), + "r"(E)); + } + else { + assert(0); + } + +#endif + +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Sparse Matrix Multiply 16832 - Float BF16, FP32 accumulation +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = bf16 * bf16 + F32 +template <> +struct SparseMma, 32, bfloat16_t, layout::RowMajor, + bfloat16_t, layout::ColumnMajor, float, layout::RowMajor, + OpMultiplyAdd, SPFormatType::Thread> { + using Shape = gemm::GemmShape<16, 8, 32>; + + using ElementA = bfloat16_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = bfloat16_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using FragmentE = uint32_t; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + static int const kSparse = 2; + + static int const kMetaSizeInBits = 2; + + static int const 
kMaxID2 = 2; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c, uint32_t const &E, int const id2) const { + +#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) + if (id2 == 0) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + } else if (id2 == 1) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k32.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + } else { + assert(0); + } +#else + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k32.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + } else if (id2 == 1) { + asm volatile( + "mma.sp.sync.aligned.m16n8k32.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), 
"r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + } else { + assert(0); + } +#endif + +#else + + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Sparse Matrix Multiply 16816 - Float TF32 +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = tf32 * tf32 + F32 +template <> +struct SparseMma, 32, tfloat32_t, layout::RowMajor, + tfloat32_t, layout::ColumnMajor, float, layout::RowMajor, + OpMultiplyAdd, SPFormatType::Thread> { + using Shape = gemm::GemmShape<16, 8, 16>; + + using ElementA = tfloat32_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = tfloat32_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using FragmentE = uint32_t; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + static int const kSparse = 2; + + static int const kMetaSizeInBits = 4; + + static int const kMaxID2 = 2; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c, uint32_t const &E, int const id2) const { + +#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) + if (id2 == 0) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k16.row.col.f32.tf32.tf32.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), 
"r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + } else if (id2 == 1) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k16.row.col.f32.tf32.tf32.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + } else { + assert(0); + } +#else + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k16.row.col.f32.tf32.tf32.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + } else if (id2 == 1) { + asm volatile( + "mma.sp.sync.aligned.m16n8k16.row.col.f32.tf32.tf32.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x1;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + } else { + assert(0); + } +#endif + +#else + + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Sparse Matrix Multiply 16864 - S8 input, S32 accumulation - SATURATE +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: S32 = S8 * S8 + S32 +template <> +struct SparseMma< + gemm::GemmShape<16,8,64>, + 32, + int8_t, + layout::RowMajor, + int8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate, + SPFormatType::Thread> { + + using Shape = 
gemm::GemmShape<16,8,64>; + + using ElementA = int8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = int8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using FragmentE = uint32_t; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm80; + + static int const kSparse = 2; + + static int const kMetaSizeInBits = 2; + + static int const kMaxID2 = 1; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c, + uint32_t const &E, + int const id2 + ) const { + +#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) + if (id2 == 0) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k64.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#else + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k64.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#endif + +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + assert(0); +#endif + 
} +}; + +/// Matrix multiply-add operation: S32 = S8 * U8 + S32 +template <> +struct SparseMma< + gemm::GemmShape<16,8,64>, + 32, + int8_t, + layout::RowMajor, + uint8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate, + SPFormatType::Thread> { + + using Shape = gemm::GemmShape<16,8,64>; + + using ElementA = int8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = uint8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using FragmentE = uint32_t; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm80; + + static int const kSparse = 2; + + static int const kMetaSizeInBits = 2; + + static int const kMaxID2 = 1; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c, + uint32_t const &E, + int const id2 + ) const { + +#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) + if (id2 == 0) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k64.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#else + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k64.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : 
"r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#endif + +#else + + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U8 * S8 + S32 +template <> +struct SparseMma< + gemm::GemmShape<16,8,64>, + 32, + uint8_t, + layout::RowMajor, + int8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate, + SPFormatType::Thread> { + + using Shape = gemm::GemmShape<16,8,64>; + + using ElementA = uint8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = int8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using FragmentE = uint32_t; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm80; + + static int const kSparse = 2; + + static int const kMetaSizeInBits = 2; + + static int const kMaxID2 = 1; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c, + uint32_t const &E, + int const id2 + ) const { + +#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) + if (id2 == 0) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k64.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), 
"r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#else + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k64.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#endif + +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U8 * U8 + S32 +template <> +struct SparseMma< + gemm::GemmShape<16,8,64>, + 32, + uint8_t, + layout::RowMajor, + uint8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate, + SPFormatType::Thread> { + + using Shape = gemm::GemmShape<16,8,64>; + + using ElementA = uint8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = uint8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using FragmentE = uint32_t; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm80; + + static int const kSparse = 2; + + static int const kMetaSizeInBits = 2; + + static int const kMaxID2 = 1; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c, + uint32_t const &E, + int const id2 + ) const { + +#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) + if (id2 == 0) { + asm volatile( + 
"mma.sp::ordered_metadata.sync.aligned.m16n8k64.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#else + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k64.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#endif + +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Sparse Matrix Multiply 168128 - S4 input, S32 accumulation - SATURATE +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: S32 = S4 * S4 + S32 +template <> +struct SparseMma< + gemm::GemmShape<16,8,128>, + 32, + cutlass::int4b_t, + layout::RowMajor, + cutlass::int4b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate, + SPFormatType::Thread> { + + using Shape = gemm::GemmShape<16,8,128>; + + using ElementA = cutlass::int4b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::int4b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using FragmentE = uint32_t; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm80; + + static int const kSparse = 2; + + static int const kMetaSizeInBits = 2; + + 
static int const kMaxID2 = 1; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c, + uint32_t const &E, + int const id2 + ) const { + +#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) + if (id2 == 0) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k128.row.col.s32.s4.s4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#else + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k128.row.col.s32.s4.s4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#endif + +#else + + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = S4 * U4 + S32 +template <> +struct SparseMma< + gemm::GemmShape<16,8,128>, + 32, + cutlass::int4b_t, + layout::RowMajor, + cutlass::uint4b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate, + SPFormatType::Thread> { + + using Shape = gemm::GemmShape<16,8,128>; + + using ElementA = cutlass::int4b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::uint4b_t; + 
using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using FragmentE = uint32_t; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm80; + + static int const kSparse = 2; + + static int const kMetaSizeInBits = 2; + + static int const kMaxID2 = 1; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c, + uint32_t const &E, + int const id2 + ) const { + +#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) + if (id2 == 0) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k128.row.col.s32.s4.u4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#else + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k128.row.col.s32.s4.u4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#endif + +#else + + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U4 * S4 + S32 +template <> +struct SparseMma< + gemm::GemmShape<16,8,128>, + 32, + 
cutlass::uint4b_t, + layout::RowMajor, + cutlass::int4b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate, + SPFormatType::Thread> { + + using Shape = gemm::GemmShape<16,8,128>; + + using ElementA = cutlass::uint4b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::int4b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using FragmentE = uint32_t; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm80; + + static int const kSparse = 2; + + static int const kMetaSizeInBits = 2; + + static int const kMaxID2 = 1; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c, + uint32_t const &E, + int const id2 + ) const { + +#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) + if (id2 == 0) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k128.row.col.s32.u4.s4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#else + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k128.row.col.s32.u4.s4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + 
"r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#endif + +#else + + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U4 * U4 + S32 +template <> +struct SparseMma< + gemm::GemmShape<16,8,128>, + 32, + cutlass::uint4b_t, + layout::RowMajor, + cutlass::uint4b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate, + SPFormatType::Thread> { + + using Shape = gemm::GemmShape<16,8,128>; + + using ElementA = cutlass::uint4b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::uint4b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using FragmentE = uint32_t; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm80; + + static int const kSparse = 2; + + static int const kMetaSizeInBits = 2; + + static int const kMaxID2 = 1; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c, + uint32_t const &E, + int const id2 + ) const { + +#if defined(CUTLASS_ARCH_SPARSE_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + +#if ((__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 5)) + if (id2 == 0) { + asm volatile( + "mma.sp::ordered_metadata.sync.aligned.m16n8k128.row.col.s32.u4.u4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); 
+ } +#else + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k128.row.col.s32.u4.u4.s32.satfinite {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]), "r"(E)); + } else { + assert(0); + } +#endif + +#else + + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + assert(0); +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace arch +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sparse_sm89.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sparse_sm89.h new file mode 100644 index 0000000000000000000000000000000000000000..6adca25527efdc1c3cb564b4553d96bebe59b3fd --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/mma_sparse_sm89.h @@ -0,0 +1,406 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief Sparse matrix multiply accumulate for SM89 +*/ + +#pragma once +#include "cutlass/cutlass.h" +#include CUDA_STD_HEADER(cassert) + +#include "mma.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_types.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4) +# define CUTLASS_ARCH_SPARSE_MMA_F32_SM89_SUPPORTED +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 890) +# if defined(CUTLASS_ARCH_SPARSE_MMA_F32_SM89_SUPPORTED) +# define CUTLASS_ARCH_SPARSE_MMA_F32_SM89_ENABLED +# endif +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace arch { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = fe4m3 * fe4m3 + F32 +template +struct SparseMma< + gemm::GemmShape<16,8,64>, + 32, + cutlass::float_e4m3_t, + layout::RowMajor, + cutlass::float_e4m3_t, + layout::ColumnMajor, + float, + layout::RowMajor, + Operator_, + SPFormatType::Thread> { + + static_assert(platform::is_same::value || + platform::is_same::value, + "Invalid operator for SM89 FP8 instruction"); + + using Shape = gemm::GemmShape<16,8,64>; + + using ElementA = cutlass::float_e4m3_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::float_e4m3_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using FragmentE = uint32_t; + + using Operator = Operator_; + using ArchTag = arch::Sm89; + + static int const kSparse = 2; + + static int const kMetaSizeInBits = 2; + + static int const kMaxID2 = 1; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + 
FragmentB const &b, + FragmentC const &c, + uint32_t const &E, + int const id2 + ) const { + +#if defined(CUTLASS_ARCH_SPARSE_MMA_F32_SM89_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k64.row.col.f32.e4m3.e4m3.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + } + else { + assert(0); + } +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + assert(0); +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = fe4m3 * fe5m2 + F32 +template +struct SparseMma< + gemm::GemmShape<16,8,64>, + 32, + cutlass::float_e4m3_t, + layout::RowMajor, + cutlass::float_e5m2_t, + layout::ColumnMajor, + float, + layout::RowMajor, + Operator_, + SPFormatType::Thread> { + + static_assert(platform::is_same::value || + platform::is_same::value, + "Invalid operator for SM89 FP8 instruction"); + + using Shape = gemm::GemmShape<16,8,64>; + + using ElementA = cutlass::float_e4m3_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::float_e5m2_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using FragmentE = uint32_t; + + using Operator = Operator_; + using ArchTag = arch::Sm89; + + static int const kSparse = 2; + + static int const kMetaSizeInBits = 2; + + static int const kMaxID2 = 1; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + 
FragmentA const &a, + FragmentB const &b, + FragmentC const &c, + uint32_t const &E, + int const id2 + ) const { + +#if defined(CUTLASS_ARCH_SPARSE_MMA_F32_SM89_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k64.row.col.f32.e4m3.e5m2.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + } + else { + assert(0); + } +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + assert(0); +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = fe5m2 * fe4m3 + F32 +template +struct SparseMma< + gemm::GemmShape<16,8,64>, + 32, + cutlass::float_e5m2_t, + layout::RowMajor, + cutlass::float_e4m3_t, + layout::ColumnMajor, + float, + layout::RowMajor, + Operator_, + SPFormatType::Thread> { + + static_assert(platform::is_same::value || + platform::is_same::value, + "Invalid operator for SM89 FP8 instruction"); + + using Shape = gemm::GemmShape<16,8,64>; + + using ElementA = cutlass::float_e5m2_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::float_e4m3_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using FragmentE = uint32_t; + + using Operator = Operator_; + using ArchTag = arch::Sm89; + + static int const kSparse = 2; + + static int const kMetaSizeInBits = 2; + + static int const kMaxID2 = 1; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void 
operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c, + uint32_t const &E, + int const id2 + ) const { + +#if defined(CUTLASS_ARCH_SPARSE_MMA_F32_SM89_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k64.row.col.f32.e5m2.e4m3.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + } + else { + assert(0); + } +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + assert(0); +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = fe5m2 * fe5m2 + F32 +template +struct SparseMma< + gemm::GemmShape<16,8,64>, + 32, + cutlass::float_e5m2_t, + layout::RowMajor, + cutlass::float_e5m2_t, + layout::ColumnMajor, + float, + layout::RowMajor, + Operator_, + SPFormatType::Thread> { + + static_assert(platform::is_same::value || + platform::is_same::value, + "Invalid operator for SM89 FP8 instruction"); + + using Shape = gemm::GemmShape<16,8,64>; + + using ElementA = cutlass::float_e5m2_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::float_e5m2_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using FragmentE = uint32_t; + + using Operator = Operator_; + using ArchTag = arch::Sm89; + + static int const kSparse = 2; + + static int const kMetaSizeInBits = 2; + + static int const kMaxID2 = 1; + + /// Computes multiply-add + 
CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c, + uint32_t const &E, + int const id2 + ) const { + +#if defined(CUTLASS_ARCH_SPARSE_MMA_F32_SM89_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + if (id2 == 0) { + asm volatile( + "mma.sp.sync.aligned.m16n8k64.row.col.f32.e5m2.e5m2.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9,%10,%11}, {%12,%13,%14,%15}, %16, 0x0;\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]), "r"(E)); + } + else { + assert(0); + } +#else + CUTLASS_UNUSED(a); + CUTLASS_UNUSED(b); + CUTLASS_UNUSED(c); + CUTLASS_UNUSED(d); + assert(0); +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace arch +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/reg_reconfig.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/reg_reconfig.h new file mode 100644 index 0000000000000000000000000000000000000000..93dd37d3193867602d69866cb2cfcd2e27e87f62 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/reg_reconfig.h @@ -0,0 +1,89 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief PTX for CTA Reconfiguration +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#if defined(__CUDACC_RTC__) +#include +#else +#include +#endif + +#ifndef CUDA_CTA_RECONFIG_ACTIVATED + #if defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ >= 12 && ( \ + (__CUDA_ARCH__ == 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL)) \ + || (__CUDA_ARCH__ == 1000 && defined(__CUDA_ARCH_FEAT_SM100_ALL)) \ + || (__CUDA_ARCH__ == 1010 && defined(__CUDA_ARCH_FEAT_SM101_ALL)) \ + || (__CUDA_ARCH__ == 1030 && defined(__CUDA_ARCH_FEAT_SM103_ALL)) \ + || (__CUDA_ARCH__ == 1200 && defined(__CUDA_ARCH_FEAT_SM120_ALL)) \ + || (__CUDA_ARCH__ == 1210 && defined(__CUDA_ARCH_FEAT_SM121_ALL)) \ + ) + #define CUDA_CTA_RECONFIG_ACTIVATED 1 + #endif + + #if defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ >= 12 && ( \ + (__CUDA_ARCH__ == 1000 && CUDA_ARCH_FAMILY(1000)) \ + || (__CUDA_ARCH__ == 1010 && CUDA_ARCH_FAMILY(1010)) \ + || (__CUDA_ARCH__ == 1030 && CUDA_ARCH_FAMILY(1030)) \ + || (__CUDA_ARCH__ == 1200 && CUDA_ARCH_FAMILY(1200)) \ + || (__CUDA_ARCH__ == 1210 && CUDA_ARCH_CONDITIONAL_OR_FAMILY(1210)) \ + ) + #define CUDA_CTA_RECONFIG_ACTIVATED 1 + #endif + +#endif + +namespace cutlass { +namespace arch { + +template +CUTLASS_DEVICE +void warpgroup_reg_alloc(){ +#if CUDA_CTA_RECONFIG_ACTIVATED + asm volatile( "setmaxnreg.inc.sync.aligned.u32 %0;\n" : : "n"(RegCount) ); +#endif +} + +template +CUTLASS_DEVICE +void warpgroup_reg_dealloc(){ +#if CUDA_CTA_RECONFIG_ACTIVATED + asm volatile( "setmaxnreg.dec.sync.aligned.u32 %0;\n" : : "n"(RegCount) ); +#endif +} + +} // namespace arch +} // namespace cutlass diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/simd.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/simd.h new file mode 100644 index 0000000000000000000000000000000000000000..a1dc7dff4d603ecf7e6a190c84bc7634e8c8be62 --- /dev/null +++ 
b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/simd.h @@ -0,0 +1,125 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Templates exposing SIMD operators +*/ + +#pragma once + +#include "cutlass/arch/array.h" +#include "cutlass/arch/numeric_types.h" + +namespace cutlass { +namespace arch { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// +// Element-wise operators +// + +CUTLASS_HOST_DEVICE +template +Array operator*(Array const &a, Array const &b) { + Array d; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + d[i] = a[i] * b[i]; + } + return d; +} + +CUTLASS_HOST_DEVICE +template +Array operator+(Array const &a, Array const &b) { + Array d; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + d[i] = a[i] + b[i]; + } + return d; +} + +CUTLASS_HOST_DEVICE +template +Array operator-(Array const &a, Array const &b) { + Array d; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + d[i] = a[i] - b[i]; + } + return d; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// +// Multiply-accumulate operators +// + +CUTLASS_HOST_DEVICE +template +Array mac(Array const &a, Array const &b, Array const &c) { + Array d; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + d[i] = a[i] * b[i] + c[i]; + } + return d; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// +// Dot product operator +// + +CUTLASS_HOST_DEVICE +template +Accumulator dot(Array const &a, Array const &b, Accumulator accum) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + accum += a[i] * b[i]; + } + return accum; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace arch +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "simd_sm60.h" +#include "simd_sm61.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// 
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/simd_sm60.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/simd_sm60.h new file mode 100644 index 0000000000000000000000000000000000000000..59f38d62da91ab9af6a1f73a5990d29056dd259a --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/simd_sm60.h @@ -0,0 +1,104 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates exposing SIMD operators for SM60 +*/ + +#pragma once + +#include "simd.h" + +namespace cutlass { +namespace arch { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// +// Element-wise operators - specialized for half_t x 2 +// + +CUTLASS_HOST_DEVICE +template <> +Array operator*(Array const &a, Array const &b) { + Array d; + + return d; +} + +CUTLASS_HOST_DEVICE +template <> +Array operator+(AArray const &a, Array const &b) { + Array d; + + return d; +} + +CUTLASS_HOST_DEVICE +template <> +Array operator-(Array const &a, Array const &b) { + Array d; + + return d; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Multiply-accumulate operators - specialized for half_t x 2 +CUTLASS_HOST_DEVICE +template <> +Array mac(Array const &a, Array const &b, Array const &c) { + Array d; + + return d; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Dot product operator - specialized for half_t <- (half_t * half_t) x 2 + half_t +CUTLASS_HOST_DEVICE +template <> +half_t dot(Array const &a, Array const &b, half_t accum) { + + return accum; +} + +/// Dot product operator - specialized for float <- (half_t * half_t) x 2 + float 
+CUTLASS_HOST_DEVICE +template <> +float dot(Array const &a, Array const &b, float accum) { + + return accum; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace arch +} // namespace cutlass diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/simd_sm61.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/simd_sm61.h new file mode 100644 index 0000000000000000000000000000000000000000..46c22665c2126b5dd2e0fb143be00143b933f3ec --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/simd_sm61.h @@ -0,0 +1,147 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates exposing SIMD operators for SM61 +*/ + +#pragma once + +#include "simd.h" + +namespace cutlass { +namespace arch { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Dot product operator - specialized for int32_t <- (int8_t * int8_t) x 4 + int32_t +CUTLASS_HOST_DEVICE +template <> +int32_t dot(Array const &a, Array const &b, int32_t accum) { + + return accum; +} + +/// Dot product operator - specialized for int32_t <- (uint8_t * int8_t) x 4 + int32_t +CUTLASS_HOST_DEVICE +template <> +int32_t dot(Array const &a, Array const &b, int32_t accum) { + + return accum; +} + +/// Dot product operator - specialized for int32_t <- (int8_t * uint8_t) x 4 + int32_t +CUTLASS_HOST_DEVICE +template <> +int32_t dot(Array const &a, Array const &b, int32_t accum) { + + return accum; +} + +/// Dot product operator - specialized for int32_t <- (uint8_t * uint8_t) x 4 + int32_t +CUTLASS_HOST_DEVICE +template <> +int32_t dot(Array const &a, Array const &b, int32_t accum) { + + return accum; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Dot product operator - specialized for int32_t <- (int16_t * int8_t) x 2 + int32_t +CUTLASS_HOST_DEVICE +template <> +int32_t dot(Array const &a, Array const &b, int32_t 
accum) { + + return accum; +} + +/// Dot product operator - specialized for int32_t <- (uint16_t * int8_t) x 2 + int32_t +CUTLASS_HOST_DEVICE +template <> +int32_t dot(Array const &a, Array const &b, int32_t accum) { + + return accum; +} + +/// Dot product operator - specialized for int32_t <- (int16_t * int8_t) x 2 + int32_t +CUTLASS_HOST_DEVICE +template <> +int32_t dot(Array const &a, Array const &b, int32_t accum) { + + return accum; +} + +/// Dot product operator - specialized for int32_t <- (uint16_t * int8_t) x 2 + int32_t +CUTLASS_HOST_DEVICE +template <> +int32_t dot(Array const &a, Array const &b, int32_t accum) { + + return accum; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Dot product operator - specialized for int32_t <- (int16_t * int16_t) x 2 + int32_t +CUTLASS_HOST_DEVICE +template <> +int32_t dot(Array const &a, Array const &b, int32_t accum) { + + return accum; +} + +/// Dot product operator - specialized for int32_t <- (uint16_t * int16_t) x 2 + int32_t +CUTLASS_HOST_DEVICE +template <> +int32_t dot(Array const &a, Array const &b, int32_t accum) { + + return accum; +} + +/// Dot product operator - specialized for int32_t <- (int16_t * int16_t) x 2 + int32_t +CUTLASS_HOST_DEVICE +template <> +int32_t dot(Array const &a, Array const &b, int32_t accum) { + + return accum; +} + +/// Dot product operator - specialized for int32_t <- (uint16_t * int16_t) x 2 + int32_t +CUTLASS_HOST_DEVICE +template <> +int32_t dot(Array const &a, Array const &b, int32_t accum) { + + return accum; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace arch +} // namespace cutlass diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/synclog.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/synclog.hpp new file mode 100644 index 
0000000000000000000000000000000000000000..5567fe561f8ca7a95f7b0958aaced2696109f22a --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/synclog.hpp @@ -0,0 +1,1271 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/*! \file + \brief Synchronization event logging for race condition debugging. +*/ + +#pragma once + +#include "cutlass/detail/helper_macros.hpp" +#include "cutlass/cutlass.h" +#if defined(__CUDACC_RTC__) +#include CUDA_STD_HEADER(cstdint) +#else +#include +#endif + +#if !defined(__CUDACC_RTC__) +#include +#include +#endif + +namespace cutlass { +namespace arch { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ENABLE_SYNCLOG) + +constexpr uint32_t synclog_cap = 1 << 26; + +inline std::mutex synclog_mutex; +inline std::vector synclog_buf_list; +#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) +CUTLASS_DEVICE uint32_t* synclog_buf; +#endif + +CUTLASS_DEVICE +uint32_t* synclog_alloc(uint32_t n) { + #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) + uint32_t* buf = synclog_buf; + if (buf == nullptr) return nullptr; + uint32_t last = atomicAdd(&buf[0], n); + if (last + n < synclog_cap) return buf + last + 1; + if (last >= synclog_cap) atomicAdd(&buf[0], -n); + #endif + return nullptr; +} + +CUTLASS_DEVICE +void synclog_emit_prefix(uint32_t* to, uint32_t header, uint32_t line) { + #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) + uint64_t time64; + asm volatile ( + "mov.u64 %0, %%globaltimer;\n" + : "=l"(time64) : + ); + to[0] = header; + to[1] = line; + to[2] = time64; + to[3] = time64 >> 32; + to[4] = threadIdx.x; + to[5] = threadIdx.y; + to[6] = threadIdx.z; + to[7] = blockIdx.x; + to[8] = blockIdx.y; + to[9] = blockIdx.z; + #endif +} + +constexpr uint32_t synclog_header_none = 0; +constexpr uint32_t synclog_length_prefix = 1 + 1 + 2 + 3 + 3; + +constexpr bool synclog_enable_syncthreads = true; +constexpr uint32_t synclog_header_syncthreads = 1; +constexpr uint32_t synclog_length_syncthreads = synclog_length_prefix + 0; + 
+constexpr bool synclog_enable_syncwarp = true; +constexpr uint32_t synclog_header_syncwarp = 2; +constexpr uint32_t synclog_length_syncwarp = synclog_length_prefix + 0; + +constexpr bool synclog_enable_named_barrier_arrive_and_wait = true; +constexpr uint32_t synclog_header_named_barrier_arrive_and_wait = 3; +constexpr uint32_t synclog_length_named_barrier_arrive_and_wait = synclog_length_prefix + 2; + +constexpr bool synclog_enable_named_barrier_arrive = true; +constexpr uint32_t synclog_header_named_barrier_arrive = 4; +constexpr uint32_t synclog_length_named_barrier_arrive = synclog_length_prefix + 2; + +constexpr bool synclog_enable_cluster_barrier_init = true; +constexpr uint32_t synclog_header_cluster_barrier_init = 5; +constexpr uint32_t synclog_length_cluster_barrier_init = synclog_length_prefix + 2; + +constexpr bool synclog_enable_cluster_barrier_wait = true; +constexpr uint32_t synclog_header_cluster_barrier_wait = 6; +constexpr uint32_t synclog_length_cluster_barrier_wait = synclog_length_prefix + 2; +constexpr bool synclog_enable_cluster_barrier_test_wait = true; +constexpr uint32_t synclog_header_cluster_barrier_test_wait = 7; +constexpr uint32_t synclog_length_cluster_barrier_test_wait = synclog_length_prefix + 3; +constexpr bool synclog_enable_cluster_barrier_try_wait = true; +constexpr uint32_t synclog_header_cluster_barrier_try_wait = 8; +constexpr uint32_t synclog_length_cluster_barrier_try_wait = synclog_length_prefix + 2; +constexpr bool synclog_enable_cluster_barrier_arrive_cluster = true; +constexpr uint32_t synclog_header_cluster_barrier_arrive_cluster = 9; +constexpr uint32_t synclog_length_cluster_barrier_arrive_cluster = synclog_length_prefix + 3; +constexpr bool synclog_enable_cluster_barrier_arrive = true; +constexpr uint32_t synclog_header_cluster_barrier_arrive = 10; +constexpr uint32_t synclog_length_cluster_barrier_arrive = synclog_length_prefix + 1; +constexpr bool synclog_enable_cluster_barrier_invalidate = true; +constexpr 
uint32_t synclog_header_cluster_barrier_invalidate = 11; +constexpr uint32_t synclog_length_cluster_barrier_invalidate = synclog_length_prefix + 1; +constexpr bool synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx = true; +constexpr uint32_t synclog_header_cluster_transaction_barrier_arrive_and_expect_tx = 12; +constexpr uint32_t synclog_length_cluster_transaction_barrier_arrive_and_expect_tx = synclog_length_prefix + 2; +constexpr bool synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx_cluster = true; +constexpr uint32_t synclog_header_cluster_transaction_barrier_arrive_and_expect_tx_cluster = 13; +constexpr uint32_t synclog_length_cluster_transaction_barrier_arrive_and_expect_tx_cluster = synclog_length_prefix + 4; +constexpr bool synclog_enable_cluster_transaction_barrier_expect_transaction = true; +constexpr uint32_t synclog_header_cluster_transaction_barrier_expect_transaction = 14; +constexpr uint32_t synclog_length_cluster_transaction_barrier_expect_transaction = synclog_length_prefix + 2; +constexpr bool synclog_enable_cluster_transaction_barrier_complete_transaction = true; +constexpr uint32_t synclog_header_cluster_transaction_barrier_complete_transaction = 15; +constexpr uint32_t synclog_length_cluster_transaction_barrier_complete_transaction = synclog_length_prefix + 4; +constexpr bool synclog_enable_fence_barrier_init = true; +constexpr uint32_t synclog_header_fence_barrier_init = 16; +constexpr uint32_t synclog_length_fence_barrier_init = synclog_length_prefix + 0; + +constexpr bool synclog_enable_fence_view_async_shared = true; +constexpr uint32_t synclog_header_fence_view_async_shared = 17; +constexpr uint32_t synclog_length_fence_view_async_shared = synclog_length_prefix + 0; + +constexpr bool synclog_enable_cp_async_wait = true; +constexpr uint32_t synclog_header_cp_async_wait = 18; +constexpr uint32_t synclog_length_cp_async_wait = synclog_length_prefix + 1; + +constexpr bool synclog_enable_cp_async_wait_all = true; 
+constexpr uint32_t synclog_header_cp_async_wait_all = 19; +constexpr uint32_t synclog_length_cp_async_wait_all = synclog_length_prefix + 0; + +constexpr bool synclog_enable_cp_async_fence = true; +constexpr uint32_t synclog_header_cp_async_fence = 20; +constexpr uint32_t synclog_length_cp_async_fence = synclog_length_prefix + 0; + +constexpr bool synclog_enable_cp_async_nan = true; +constexpr uint32_t synclog_header_cp_async_nan = 21; +constexpr uint32_t synclog_length_cp_async_nan = synclog_length_prefix + 4; + +constexpr bool synclog_enable_cp_async_zfill = true; +constexpr uint32_t synclog_header_cp_async_zfill = 22; +constexpr uint32_t synclog_length_cp_async_zfill = synclog_length_prefix + 5; + +constexpr bool synclog_enable_cp_async = true; +constexpr uint32_t synclog_header_cp_async = 23; +constexpr uint32_t synclog_length_cp_async = synclog_length_prefix + 5; + +constexpr bool synclog_enable_tma_load = true; +constexpr uint32_t synclog_header_tma_load = 24; +constexpr uint32_t synclog_length_tma_load = synclog_length_prefix + 4; + +constexpr bool synclog_enable_tma_store = true; +constexpr uint32_t synclog_header_tma_store = 25; +constexpr uint32_t synclog_length_tma_store = synclog_length_prefix + 3; + +constexpr bool synclog_enable_tma_store_arrive = true; +constexpr uint32_t synclog_header_tma_store_arrive = 26; +constexpr uint32_t synclog_length_tma_store_arrive = synclog_length_prefix + 0; + +constexpr bool synclog_enable_tma_store_wait = true; +constexpr uint32_t synclog_header_tma_store_wait = 27; +constexpr uint32_t synclog_length_tma_store_wait = synclog_length_prefix + 1; + +constexpr bool synclog_enable_warpgroup_arrive = true; +constexpr uint32_t synclog_header_warpgroup_arrive = 28; +constexpr uint32_t synclog_length_warpgroup_arrive = synclog_length_prefix + 0; + +constexpr bool synclog_enable_warpgroup_wait = true; +constexpr uint32_t synclog_header_warpgroup_wait = 29; +constexpr uint32_t synclog_length_warpgroup_wait = 
synclog_length_prefix + 1; + +constexpr bool synclog_enable_warpgroup_commit_batch = true; +constexpr uint32_t synclog_header_warpgroup_commit_batch = 30; +constexpr uint32_t synclog_length_warpgroup_commit_batch = synclog_length_prefix + 0; + +constexpr bool synclog_enable_wgmma_reg_smem = true; +constexpr uint32_t synclog_header_wgmma_reg_smem = 31; +constexpr uint32_t synclog_length_wgmma_reg_smem = synclog_length_prefix + 2; + +constexpr bool synclog_enable_wgmma_smem_smem = true; +constexpr uint32_t synclog_header_wgmma_smem_smem = 32; +constexpr uint32_t synclog_length_wgmma_smem_smem = synclog_length_prefix + 4; + +constexpr bool synclog_enable_cpasync_barrier_arrive = true; +constexpr uint32_t synclog_header_cpasync_barrier_arrive = 33; +constexpr uint32_t synclog_length_cpasync_barrier_arrive = synclog_length_prefix + 1; +CUTLASS_DEVICE +bool synclog_condition_emit() { + #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) + return threadIdx.x % NumThreadsPerWarp == 0 && threadIdx.y == 0 && threadIdx.z == 0 && + blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0; + #else + return 0; + #endif +} + +CUTLASS_DEVICE +bool synclog_condition_print() { + #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) + return threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0 && + blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0; + #else + return false; + #endif +} + +CUTLASS_DEVICE +void synclog_print_prefix(char const* header, uint32_t at) { + #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) + uint32_t line = synclog_buf[at + 1]; + uint32_t timeLo = synclog_buf[at + 2]; + uint32_t timeHi = synclog_buf[at + 3]; + uint32_t threadIdxX = synclog_buf[at + 4]; + uint32_t threadIdxY = synclog_buf[at + 5]; + uint32_t threadIdxZ = synclog_buf[at + 6]; + uint32_t blockIdxX = synclog_buf[at + 7]; + uint32_t blockIdxY = synclog_buf[at + 8]; + uint32_t blockIdxZ = synclog_buf[at + 9]; + printf( + "%s line=%u time=%lu 
thread=%u,%u,%u block=%u,%u,%u ", + header, line, + (uint64_t)timeHi << 32 | timeLo, + threadIdxX, threadIdxY, threadIdxZ, + blockIdxX, blockIdxY, blockIdxZ + ); + #endif +} + +CUTLASS_DEVICE +void synclog_print_wgmma_desc(char const* str, uint32_t lo, uint32_t hi, char const* sep) { + CUTLASS_UNUSED(hi); + uint32_t smem_int_ptr = (lo & ((1 << 14) - 1)) << 4; + printf("%s_smem_int_ptr=%u%s", str, smem_int_ptr, sep); +} + +#endif // defined(CUTLASS_ENABLE_SYNCLOG) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline void synclog_setup() { + #if defined(CUTLASS_ENABLE_SYNCLOG) + #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) + std::scoped_lock lock(synclog_mutex); + auto fail = [] () { + fprintf(stderr, "synclog_setup() failed\n"); + std::terminate(); + }; + int orig_device = 0; + if (cudaGetDevice(&orig_device) != cudaSuccess) { + fail(); + } + int device_count = 0; + if (cudaGetDeviceCount(&device_count) != cudaSuccess) { + fail(); + } + if (synclog_buf_list.size() == 0) { + for (int device = 0; device < device_count; device++) { + uint32_t* buf = 0; + if (cudaSetDevice(device) != cudaSuccess || + cudaMalloc(&buf, synclog_cap * sizeof(uint32_t)) != cudaSuccess) { + fail(); + } + synclog_buf_list.push_back(buf); + } + } + for (int device = 0; device < device_count; device++) { + uint32_t* buf = synclog_buf_list.at(device); + if (cudaSetDevice(device) != cudaSuccess || + cudaMemset(buf, 0, synclog_cap * sizeof(uint32_t)) != cudaSuccess || + cudaMemcpyToSymbol(synclog_buf, &buf, sizeof(buf)) != cudaSuccess) { + fail(); + } + } + if (cudaSetDevice(orig_device) != cudaSuccess) { + fail(); + } + #endif + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_syncthreads(uint32_t line) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_syncthreads) return; + if (!synclog_condition_emit()) return; + uint32_t* to = 
synclog_alloc(synclog_length_syncthreads); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_syncthreads, line); + #else + CUTLASS_UNUSED(line); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_syncwarp(uint32_t line) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_syncwarp) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_syncwarp); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_syncwarp, line); + #else + CUTLASS_UNUSED(line); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_named_barrier_arrive_and_wait( + uint32_t line, + uint32_t num_threads, + uint32_t barrier_id) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_named_barrier_arrive_and_wait) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_named_barrier_arrive_and_wait); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_named_barrier_arrive_and_wait, line); + to[synclog_length_prefix + 0] = num_threads; + to[synclog_length_prefix + 1] = barrier_id; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(num_threads); + CUTLASS_UNUSED(barrier_id); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + +CUTLASS_DEVICE +void synclog_emit_named_barrier_arrive( + uint32_t line, + uint32_t num_threads, + uint32_t barrier_id) { + #if defined(CUTLASS_ENABLE_SYNCLOG) + if constexpr (!synclog_enable_named_barrier_arrive) return; + if (!synclog_condition_emit()) return; + uint32_t* to = synclog_alloc(synclog_length_named_barrier_arrive); + if (to == nullptr) return; + synclog_emit_prefix(to, synclog_header_named_barrier_arrive, line); + to[synclog_length_prefix + 0] = num_threads; + to[synclog_length_prefix + 1] = barrier_id; + #else + CUTLASS_UNUSED(line); + CUTLASS_UNUSED(num_threads); + CUTLASS_UNUSED(barrier_id); + #endif // defined(CUTLASS_ENABLE_SYNCLOG) +} + 
// Cluster-barrier / cluster-transaction-barrier event emitters.
// All share one pattern: compile-time enable gate, run-time emit predicate,
// claim space with synclog_alloc, write prefix, then positional payload words.
// The payload order must match the positional decode in synclog_print().

// Payload: {smem_addr, arrive_count}.
CUTLASS_DEVICE
void synclog_emit_cluster_barrier_init(
  uint32_t line,
  uint32_t smem_addr,
  uint32_t arrive_count) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_cluster_barrier_init) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_init);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_cluster_barrier_init, line);
  to[synclog_length_prefix + 0] = smem_addr;
  to[synclog_length_prefix + 1] = arrive_count;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(smem_addr);
  CUTLASS_UNUSED(arrive_count);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Payload: {smem_addr, phase}.
CUTLASS_DEVICE
void synclog_emit_cluster_barrier_wait(
  uint32_t line,
  uint32_t smem_addr,
  uint32_t phase) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_cluster_barrier_wait) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_wait);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_cluster_barrier_wait, line);
  to[synclog_length_prefix + 0] = smem_addr;
  to[synclog_length_prefix + 1] = phase;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(smem_addr);
  CUTLASS_UNUSED(phase);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Payload: {smem_addr, phase, pred}.
CUTLASS_DEVICE
void synclog_emit_cluster_barrier_test_wait(
  uint32_t line,
  uint32_t smem_addr,
  uint32_t phase,
  uint32_t pred) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_cluster_barrier_test_wait) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_test_wait);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_cluster_barrier_test_wait, line);
  to[synclog_length_prefix + 0] = smem_addr;
  to[synclog_length_prefix + 1] = phase;
  to[synclog_length_prefix + 2] = pred;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(smem_addr);
  CUTLASS_UNUSED(phase);
  CUTLASS_UNUSED(pred);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Payload: {smem_addr, phase}.
CUTLASS_DEVICE
void synclog_emit_cluster_barrier_try_wait(
  uint32_t line,
  uint32_t smem_addr,
  uint32_t phase) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_cluster_barrier_try_wait) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_try_wait);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_cluster_barrier_try_wait, line);
  to[synclog_length_prefix + 0] = smem_addr;
  to[synclog_length_prefix + 1] = phase;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(smem_addr);
  CUTLASS_UNUSED(phase);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Payload: {smem_addr, cta_id, pred}.
CUTLASS_DEVICE
void synclog_emit_cluster_barrier_arrive_cluster(
  uint32_t line,
  uint32_t smem_addr,
  uint32_t cta_id,
  uint32_t pred) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_cluster_barrier_arrive_cluster) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_arrive_cluster);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_cluster_barrier_arrive_cluster, line);
  to[synclog_length_prefix + 0] = smem_addr;
  to[synclog_length_prefix + 1] = cta_id;
  to[synclog_length_prefix + 2] = pred;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(smem_addr);
  CUTLASS_UNUSED(cta_id);
  CUTLASS_UNUSED(pred);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Payload: {smem_addr}.
CUTLASS_DEVICE
void synclog_emit_cluster_barrier_arrive(
  uint32_t line,
  uint32_t smem_addr) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_cluster_barrier_arrive) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_arrive);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_cluster_barrier_arrive, line);
  to[synclog_length_prefix + 0] = smem_addr;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(smem_addr);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Payload: {smem_addr}.
CUTLASS_DEVICE
void synclog_emit_cluster_barrier_invalidate(
  uint32_t line,
  uint32_t smem_addr) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_cluster_barrier_invalidate) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_cluster_barrier_invalidate);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_cluster_barrier_invalidate, line);
  to[synclog_length_prefix + 0] = smem_addr;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(smem_addr);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Payload: {smem_addr, transaction_bytes}.
CUTLASS_DEVICE
void synclog_emit_cluster_transaction_barrier_arrive_and_expect_tx(
  uint32_t line,
  uint32_t smem_addr,
  uint32_t transaction_bytes) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_cluster_transaction_barrier_arrive_and_expect_tx);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_cluster_transaction_barrier_arrive_and_expect_tx, line);
  to[synclog_length_prefix + 0] = smem_addr;
  to[synclog_length_prefix + 1] = transaction_bytes;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(smem_addr);
  CUTLASS_UNUSED(transaction_bytes);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Payload: {smem_addr, transaction_bytes, cta_id, pred}.
CUTLASS_DEVICE
void synclog_emit_cluster_transaction_barrier_arrive_and_expect_tx_cluster(
  uint32_t line,
  uint32_t smem_addr,
  uint32_t transaction_bytes,
  uint32_t cta_id,
  uint32_t pred) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx_cluster) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_cluster_transaction_barrier_arrive_and_expect_tx_cluster);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_cluster_transaction_barrier_arrive_and_expect_tx_cluster, line);
  to[synclog_length_prefix + 0] = smem_addr;
  to[synclog_length_prefix + 1] = transaction_bytes;
  to[synclog_length_prefix + 2] = cta_id;
  to[synclog_length_prefix + 3] = pred;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(smem_addr);
  CUTLASS_UNUSED(transaction_bytes);
  CUTLASS_UNUSED(cta_id);
  CUTLASS_UNUSED(pred);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Payload: {smem_addr, transaction_bytes}.
CUTLASS_DEVICE
void synclog_emit_cluster_transaction_barrier_expect_transaction(
  uint32_t line,
  uint32_t smem_addr,
  uint32_t transaction_bytes) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_cluster_transaction_barrier_expect_transaction) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_cluster_transaction_barrier_expect_transaction);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_cluster_transaction_barrier_expect_transaction, line);
  to[synclog_length_prefix + 0] = smem_addr;
  to[synclog_length_prefix + 1] = transaction_bytes;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(smem_addr);
  CUTLASS_UNUSED(transaction_bytes);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Payload: {smem_addr, dst_cta_id, transaction_bytes, pred}.
CUTLASS_DEVICE
void synclog_emit_cluster_transaction_barrier_complete_transaction(
  uint32_t line,
  uint32_t smem_addr,
  uint32_t dst_cta_id,
  uint32_t transaction_bytes,
  uint32_t pred) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_cluster_transaction_barrier_complete_transaction) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_cluster_transaction_barrier_complete_transaction);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_cluster_transaction_barrier_complete_transaction, line);
  to[synclog_length_prefix + 0] = smem_addr;
  to[synclog_length_prefix + 1] = dst_cta_id;
  to[synclog_length_prefix + 2] = transaction_bytes;
  to[synclog_length_prefix + 3] = pred;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(smem_addr);
  CUTLASS_UNUSED(dst_cta_id);
  CUTLASS_UNUSED(transaction_bytes);
  CUTLASS_UNUSED(pred);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Prefix-only event (header + source line, no payload).
CUTLASS_DEVICE
void synclog_emit_fence_barrier_init(uint32_t line) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_fence_barrier_init) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_fence_barrier_init);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_fence_barrier_init, line);
  #else
  CUTLASS_UNUSED(line);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Prefix-only event.
CUTLASS_DEVICE
void synclog_emit_fence_view_async_shared(uint32_t line) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_fence_view_async_shared) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_fence_view_async_shared);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_fence_view_async_shared, line);
  #else
  CUTLASS_UNUSED(line);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Payload: {n} (the cp.async wait-group depth argument).
CUTLASS_DEVICE
void synclog_emit_cp_async_wait(
  uint32_t line,
  uint32_t n) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_cp_async_wait) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_cp_async_wait);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_cp_async_wait, line);
  to[synclog_length_prefix + 0] = n;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(n);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Prefix-only event.
CUTLASS_DEVICE
void synclog_emit_cp_async_wait_all(uint32_t line) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_cp_async_wait_all) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_cp_async_wait_all);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_cp_async_wait_all, line);
  #else
  CUTLASS_UNUSED(line);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Prefix-only event.
CUTLASS_DEVICE
void synclog_emit_cp_async_fence(uint32_t line) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_cp_async_fence) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_cp_async_fence);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_cp_async_fence, line);
  #else
  CUTLASS_UNUSED(line);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Payload: {smem_addr, gmem_ptr lo32, gmem_ptr hi32, pred}.
// The 64-bit global pointer is split into two 32-bit log words.
CUTLASS_DEVICE
void synclog_emit_cp_async_nan(
  uint32_t line,
  uint32_t smem_addr,
  const void* gmem_ptr,
  uint32_t pred) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_cp_async_nan) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_cp_async_nan);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_cp_async_nan, line);
  to[synclog_length_prefix + 0] = smem_addr;
  to[synclog_length_prefix + 1] = (uint32_t)((uint64_t)gmem_ptr);
  to[synclog_length_prefix + 2] = (uint32_t)((uint64_t)gmem_ptr >> 32);
  to[synclog_length_prefix + 3] = pred;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(smem_addr);
  CUTLASS_UNUSED(gmem_ptr);
  CUTLASS_UNUSED(pred);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Payload: {smem_addr, gmem_ptr lo32, gmem_ptr hi32, pred, size}.
CUTLASS_DEVICE
void synclog_emit_cp_async_zfill(
  uint32_t line,
  uint32_t smem_addr,
  const void* gmem_ptr,
  uint32_t pred,
  uint32_t size) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_cp_async_zfill) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_cp_async_zfill);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_cp_async_zfill, line);
  to[synclog_length_prefix + 0] = smem_addr;
  to[synclog_length_prefix + 1] = (uint32_t)((uint64_t)gmem_ptr);
  to[synclog_length_prefix + 2] = (uint32_t)((uint64_t)gmem_ptr >> 32);
  to[synclog_length_prefix + 3] = pred;
  to[synclog_length_prefix + 4] = size;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(smem_addr);
  CUTLASS_UNUSED(gmem_ptr);
  CUTLASS_UNUSED(pred);
  CUTLASS_UNUSED(size);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Payload: {smem_addr, gmem_ptr lo32, gmem_ptr hi32, pred, size}.
CUTLASS_DEVICE
void synclog_emit_cp_async(
  uint32_t line,
  uint32_t smem_addr,
  const void* gmem_ptr,
  uint32_t pred,
  uint32_t size) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_cp_async) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_cp_async);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_cp_async, line);
  to[synclog_length_prefix + 0] = smem_addr;
  to[synclog_length_prefix + 1] = (uint32_t)((uint64_t)gmem_ptr);
  to[synclog_length_prefix + 2] = (uint32_t)((uint64_t)gmem_ptr >> 32);
  to[synclog_length_prefix + 3] = pred;
  to[synclog_length_prefix + 4] = size;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(smem_addr);
  CUTLASS_UNUSED(gmem_ptr);
  CUTLASS_UNUSED(pred);
  CUTLASS_UNUSED(size);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Payload: {gmem_int_desc lo32, gmem_int_desc hi32, smem_int_mbar, smem_int_ptr}.
CUTLASS_DEVICE
void synclog_emit_tma_load(
  uint32_t line,
  uint64_t gmem_int_desc,
  uint32_t smem_int_mbar,
  uint32_t smem_int_ptr) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_tma_load) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_tma_load);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_tma_load, line);
  to[synclog_length_prefix + 0] = (uint32_t)((uint64_t)gmem_int_desc);
  to[synclog_length_prefix + 1] = (uint32_t)((uint64_t)gmem_int_desc >> 32);
  to[synclog_length_prefix + 2] = smem_int_mbar;
  to[synclog_length_prefix + 3] = smem_int_ptr;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(gmem_int_desc);
  CUTLASS_UNUSED(smem_int_mbar);
  CUTLASS_UNUSED(smem_int_ptr);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
defined(CUTLASS_ENABLE_SYNCLOG)
}

// Payload: {gmem_int_desc lo32, gmem_int_desc hi32, smem_int_ptr}.
CUTLASS_DEVICE
void synclog_emit_tma_store(
  uint32_t line,
  uint64_t gmem_int_desc,
  uint32_t smem_int_ptr) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_tma_store) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_tma_store);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_tma_store, line);
  to[synclog_length_prefix + 0] = (uint32_t)((uint64_t)gmem_int_desc);
  to[synclog_length_prefix + 1] = (uint32_t)((uint64_t)gmem_int_desc >> 32);
  to[synclog_length_prefix + 2] = smem_int_ptr;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(gmem_int_desc);
  CUTLASS_UNUSED(smem_int_ptr);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Prefix-only event.
CUTLASS_DEVICE
void synclog_emit_tma_store_arrive(uint32_t line) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_tma_store_arrive) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_tma_store_arrive);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_tma_store_arrive, line);
  #else
  CUTLASS_UNUSED(line);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Payload: {count}.
CUTLASS_DEVICE
void synclog_emit_tma_store_wait(
  uint32_t line,
  uint32_t count) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_tma_store_wait) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_tma_store_wait);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_tma_store_wait, line);
  to[synclog_length_prefix + 0] = count;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(count);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Prefix-only event.
CUTLASS_DEVICE
void synclog_emit_warpgroup_arrive(
  uint32_t line) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_warpgroup_arrive) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_warpgroup_arrive);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_warpgroup_arrive, line);
  #else
  CUTLASS_UNUSED(line);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Payload: {n}.
CUTLASS_DEVICE
void synclog_emit_warpgroup_wait(
  uint32_t line,
  uint32_t n) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_warpgroup_wait) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_warpgroup_wait);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_warpgroup_wait, line);
  to[synclog_length_prefix + 0] = n;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(n);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Prefix-only event.
CUTLASS_DEVICE
void synclog_emit_warpgroup_commit_batch(
  uint32_t line) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_warpgroup_commit_batch) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_warpgroup_commit_batch);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_warpgroup_commit_batch, line);
  #else
  CUTLASS_UNUSED(line);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Payload: {desc_b lo32, desc_b hi32}. The assignments narrow the 64-bit
// descriptor into 32-bit log words (low word first, then shifted high word).
CUTLASS_DEVICE
void synclog_emit_wgmma_reg_smem(
  uint32_t line,
  uint64_t desc_b) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_wgmma_reg_smem) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_wgmma_reg_smem);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_wgmma_reg_smem, line);
  to[synclog_length_prefix + 0] = desc_b;
  to[synclog_length_prefix + 1] = desc_b >> 32;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(desc_b);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Payload: {desc_a lo32, desc_a hi32, desc_b lo32, desc_b hi32}.
CUTLASS_DEVICE
void synclog_emit_wgmma_smem_smem(
  uint32_t line,
  uint64_t desc_a,
  uint64_t desc_b) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_wgmma_smem_smem) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_wgmma_smem_smem);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_wgmma_smem_smem, line);
  to[synclog_length_prefix + 0] = desc_a;
  to[synclog_length_prefix + 1] = desc_a >> 32;
  to[synclog_length_prefix + 2] = desc_b;
  to[synclog_length_prefix + 3] = desc_b >> 32;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(desc_a);
  CUTLASS_UNUSED(desc_b);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Payload: {smem_addr}.
CUTLASS_DEVICE
void synclog_emit_cpasync_barrier_arrive(
  uint32_t line,
  uint32_t smem_addr) {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  if constexpr (!synclog_enable_cpasync_barrier_arrive) return;
  if (!synclog_condition_emit()) return;
  uint32_t* to = synclog_alloc(synclog_length_cpasync_barrier_arrive);
  if (to == nullptr) return;
  synclog_emit_prefix(to, synclog_header_cpasync_barrier_arrive, line);
  to[synclog_length_prefix + 0] = smem_addr;
  #else
  CUTLASS_UNUSED(line);
  CUTLASS_UNUSED(smem_addr);
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

// Decodes and prints the accumulated log buffer. When synclog is disabled this
// collapses to an empty device function; when enabled under a CUDA compiler it
// is kept out-of-line (noinline) since it is debug-only cold code.
#if !defined(CUTLASS_ENABLE_SYNCLOG)
CUTLASS_DEVICE
#elif defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
static __attribute__((__noinline__)) __device__
#else
static __attribute__((__noinline__))
#endif
void synclog_print() {
  #if defined(CUTLASS_ENABLE_SYNCLOG)
  #if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__))
  if (synclog_buf == nullptr || !synclog_condition_print()) {
    return;
  }
  printf("synclog start\n");
  // Walk the buffer record by record; slot 0 holds the write cursor, so the
  // first record starts at index 1. Each branch below must advance `at` by the
  // record length for its event type before decoding payload words backwards
  // from the new position.
  for (uint32_t at = 1; at < synclog_cap; ) {
    uint32_t header = synclog_buf[at];
    if (header == synclog_header_none) {
      break;
    }
    printf("synclog at %u: ", at);
    if constexpr (synclog_enable_syncthreads) {
      if (header == synclog_header_syncthreads) {
        synclog_print_prefix("syncthreads", at);
        at += synclog_length_syncthreads;
        printf("\n");
        continue;
      }
    }
    if constexpr (synclog_enable_syncwarp) {
      if (header == synclog_header_syncwarp) {
        // Continuation of the synclog_print decode loop: one branch per event
        // type, mirroring the payload layout written by the matching
        // synclog_emit_* function (payload read back-to-front via at-K).
        synclog_print_prefix("syncwarp", at);
        at += synclog_length_syncwarp;
        printf("\n");
        continue;
      }
    }
    if constexpr (synclog_enable_named_barrier_arrive_and_wait) {
      if (header == synclog_header_named_barrier_arrive_and_wait) {
        synclog_print_prefix("named_barrier_arrive_and_wait", at);
        at += synclog_length_named_barrier_arrive_and_wait;
        printf("num_threads=%u barrier_id=%u\n", synclog_buf[at-2], synclog_buf[at-1]);
        continue;
      }
    }
    if constexpr (synclog_enable_named_barrier_arrive) {
      if (header == synclog_header_named_barrier_arrive) {
        synclog_print_prefix("named_barrier_arrive", at);
        at += synclog_length_named_barrier_arrive;
        printf("num_threads=%u barrier_id=%u\n", synclog_buf[at-2], synclog_buf[at-1]);
        continue;
      }
    }
    if constexpr (synclog_enable_cluster_barrier_init) {
      if (header == synclog_header_cluster_barrier_init) {
        synclog_print_prefix("cluster_barrier_init", at);
        at += synclog_length_cluster_barrier_init;
        printf("smem_addr=%u arrive_count=%u\n", synclog_buf[at-2], synclog_buf[at-1]);
        continue;
      }
    }
    if constexpr (synclog_enable_cluster_barrier_wait) {
      if (header == synclog_header_cluster_barrier_wait) {
        synclog_print_prefix("cluster_barrier_wait", at);
        at += synclog_length_cluster_barrier_wait;
        printf("smem_addr=%u phase=%u\n", synclog_buf[at-2], synclog_buf[at-1]);
        continue;
      }
    }
    if constexpr (synclog_enable_cluster_barrier_test_wait) {
      if (header == synclog_header_cluster_barrier_test_wait) {
        synclog_print_prefix("cluster_barrier_test_wait", at);
        at += synclog_length_cluster_barrier_test_wait;
        printf("smem_addr=%u phase=%u pred=%u\n", synclog_buf[at-3], synclog_buf[at-2], synclog_buf[at-1]);
        continue;
      }
    }
    if constexpr (synclog_enable_cluster_barrier_try_wait) {
      if (header == synclog_header_cluster_barrier_try_wait) {
        synclog_print_prefix("cluster_barrier_try_wait", at);
        at += synclog_length_cluster_barrier_try_wait;
        printf("smem_addr=%u phase=%u\n", synclog_buf[at-2], synclog_buf[at-1]);
        continue;
      }
    }
    if constexpr (synclog_enable_cluster_barrier_arrive_cluster) {
      if (header == synclog_header_cluster_barrier_arrive_cluster) {
        synclog_print_prefix("cluster_barrier_arrive_cluster", at);
        at += synclog_length_cluster_barrier_arrive_cluster;
        printf("smem_addr=%u cta_id=%u pred=%u\n", synclog_buf[at-3], synclog_buf[at-2], synclog_buf[at-1]);
        continue;
      }
    }
    if constexpr (synclog_enable_cluster_barrier_arrive) {
      if (header == synclog_header_cluster_barrier_arrive) {
        synclog_print_prefix("cluster_barrier_arrive", at);
        at += synclog_length_cluster_barrier_arrive;
        printf("smem_addr=%u\n", synclog_buf[at-1]);
        continue;
      }
    }
    if constexpr (synclog_enable_cluster_barrier_invalidate) {
      if (header == synclog_header_cluster_barrier_invalidate) {
        synclog_print_prefix("cluster_barrier_invalidate", at);
        at += synclog_length_cluster_barrier_invalidate;
        printf("smem_addr=%u\n", synclog_buf[at-1]);
        continue;
      }
    }
    if constexpr (synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx) {
      if (header == synclog_header_cluster_transaction_barrier_arrive_and_expect_tx) {
        synclog_print_prefix("cluster_transaction_barrier_arrive_and_expect_tx", at);
        at += synclog_length_cluster_transaction_barrier_arrive_and_expect_tx;
        printf("smem_addr=%u transaction_bytes=%u\n", synclog_buf[at-2], synclog_buf[at-1]);
        continue;
      }
    }
    if constexpr (synclog_enable_cluster_transaction_barrier_arrive_and_expect_tx_cluster) {
      if (header == synclog_header_cluster_transaction_barrier_arrive_and_expect_tx_cluster) {
        synclog_print_prefix("cluster_transaction_barrier_arrive_and_expect_tx_cluster", at);
        at += synclog_length_cluster_transaction_barrier_arrive_and_expect_tx_cluster;
        printf("smem_addr=%u transaction_bytes=%u cta_id=%u pred=%u\n", synclog_buf[at-4], synclog_buf[at-3], synclog_buf[at-2], synclog_buf[at-1]);
        continue;
      }
    }
    if constexpr (synclog_enable_cluster_transaction_barrier_expect_transaction) {
      if (header == synclog_header_cluster_transaction_barrier_expect_transaction) {
        synclog_print_prefix("cluster_transaction_barrier_expect_transaction", at);
        at += synclog_length_cluster_transaction_barrier_expect_transaction;
        printf("smem_addr=%u transaction_bytes=%u\n", synclog_buf[at-2], synclog_buf[at-1]);
        continue;
      }
    }
    if constexpr (synclog_enable_cluster_transaction_barrier_complete_transaction) {
      if (header == synclog_header_cluster_transaction_barrier_complete_transaction) {
        synclog_print_prefix("cluster_transaction_barrier_complete_transaction", at);
        at += synclog_length_cluster_transaction_barrier_complete_transaction;
        printf("smem_addr=%u dst_cta_id=%u transaction_bytes=%u pred=%u\n", synclog_buf[at-4], synclog_buf[at-3], synclog_buf[at-2], synclog_buf[at-1]);
        continue;
      }
    }
    if constexpr (synclog_enable_fence_barrier_init) {
      if (header == synclog_header_fence_barrier_init) {
        synclog_print_prefix("fence_barrier_init", at);
        at += synclog_length_fence_barrier_init;
        printf("\n");
        continue;
      }
    }
    if constexpr (synclog_enable_fence_view_async_shared) {
      if (header == synclog_header_fence_view_async_shared) {
        synclog_print_prefix("fence_view_async_shared", at);
        at += synclog_length_fence_view_async_shared;
        printf("\n");
        continue;
      }
    }
    if constexpr (synclog_enable_cp_async_wait) {
      if (header == synclog_header_cp_async_wait) {
        synclog_print_prefix("cp_async_wait", at);
        at += synclog_length_cp_async_wait;
        printf("n=%u\n", synclog_buf[at-1]);
        continue;
      }
    }
    if constexpr (synclog_enable_cp_async_wait_all) {
      if (header == synclog_header_cp_async_wait_all) {
        synclog_print_prefix("cp_async_wait_all", at);
        at += synclog_length_cp_async_wait_all;
        printf("\n");
        continue;
      }
    }
    if constexpr (synclog_enable_cp_async_fence) {
      if (header == synclog_header_cp_async_fence) {
        synclog_print_prefix("cp_async_fence", at);
        at += synclog_length_cp_async_fence;
        printf("\n");
        continue;
      }
    }
    if constexpr (synclog_enable_cp_async_nan) {
      if (header == synclog_header_cp_async_nan) {
        synclog_print_prefix("cp_async_nan", at);
        at += synclog_length_cp_async_nan;
        // Reassemble the 64-bit global address from its two 32-bit log words.
        uint64_t gmem_addr = synclog_buf[at-3];
        gmem_addr += (uint64_t)synclog_buf[at-2] << 32;
        printf("smem_addr=%u gmem_addr=%llu pred=%u\n", synclog_buf[at-4], gmem_addr, synclog_buf[at-1]);
        continue;
      }
    }
    if constexpr (synclog_enable_cp_async_zfill) {
      if (header == synclog_header_cp_async_zfill) {
        synclog_print_prefix("cp_async_zfill", at);
        at += synclog_length_cp_async_zfill;
        uint64_t gmem_addr = synclog_buf[at-4];
        gmem_addr += (uint64_t)synclog_buf[at-3] << 32;
        printf("smem_addr=%u gmem_addr=%llu pred=%u size=%u\n", synclog_buf[at-5], gmem_addr, synclog_buf[at-2], synclog_buf[at-1]);
        continue;
      }
    }
    if constexpr (synclog_enable_cp_async) {
      if (header == synclog_header_cp_async) {
        synclog_print_prefix("cp_async", at);
        at += synclog_length_cp_async;
        uint64_t gmem_addr = synclog_buf[at-4];
        gmem_addr += (uint64_t)synclog_buf[at-3] << 32;
        printf("smem_addr=%u gmem_addr=%llu pred=%u size=%u\n", synclog_buf[at-5], gmem_addr, synclog_buf[at-2], synclog_buf[at-1]);
        continue;
      }
    }
    if constexpr (synclog_enable_tma_load) {
      if (header == synclog_header_tma_load) {
        synclog_print_prefix("tma_load", at);
        at += synclog_length_tma_load;
        uint64_t gmem_int_desc = synclog_buf[at-4];
        gmem_int_desc += (uint64_t)synclog_buf[at-3] << 32;
        printf("gmem_int_desc=%llu smem_int_mbar=%u smem_int_ptr=%u\n", gmem_int_desc, synclog_buf[at-2], synclog_buf[at-1]);
        continue;
      }
    }
    if constexpr (synclog_enable_tma_store) {
      if (header == synclog_header_tma_store) {
        synclog_print_prefix("tma_store", at);
        at += synclog_length_tma_store;
        uint64_t gmem_int_desc = synclog_buf[at-3];
        gmem_int_desc += (uint64_t)synclog_buf[at-2] << 32;
        printf("gmem_int_desc=%llu smem_int_ptr=%u\n", gmem_int_desc, synclog_buf[at-1]);
        continue;
      }
    }
    if constexpr (synclog_enable_tma_store_arrive) {
      if (header == synclog_header_tma_store_arrive) {
        synclog_print_prefix("tma_store_arrive", at);
        at += synclog_length_tma_store_arrive;
        printf("\n");
        continue;
      }
    }
    if constexpr (synclog_enable_tma_store_wait) {
      if (header == synclog_header_tma_store_wait) {
        synclog_print_prefix("tma_store_wait", at);
        at += synclog_length_tma_store_wait;
        printf("count=%u\n", synclog_buf[at-1]);
        continue;
      }
    }
    if constexpr (synclog_enable_warpgroup_arrive) {
      if (header == synclog_header_warpgroup_arrive) {
        synclog_print_prefix("warpgroup_arrive", at);
        at += synclog_length_warpgroup_arrive;
        printf("\n");
        continue;
      }
    }
    if constexpr (synclog_enable_warpgroup_wait) {
      if (header == synclog_header_warpgroup_wait) {
        synclog_print_prefix("warpgroup_wait", at);
        at += synclog_length_warpgroup_wait;
        printf("n=%u\n", synclog_buf[at-1]);
        continue;
      }
    }
    if constexpr (synclog_enable_warpgroup_commit_batch) {
      if (header == synclog_header_warpgroup_commit_batch) {
        synclog_print_prefix("warpgroup_commit_batch", at);
        at += synclog_length_warpgroup_commit_batch;
        printf("\n");
        continue;
      }
    }
    if constexpr (synclog_enable_wgmma_reg_smem) {
      if (header == synclog_header_wgmma_reg_smem) {
        synclog_print_prefix("wgmma_reg_smem", at);
        at += synclog_length_wgmma_reg_smem;
        synclog_print_wgmma_desc("desc_b", synclog_buf[at-2], synclog_buf[at-1], "");
        printf("\n");
        continue;
      }
    }
    if constexpr (synclog_enable_wgmma_smem_smem) {
      if (header == synclog_header_wgmma_smem_smem) {
        synclog_print_prefix("wgmma_smem_smem", at);
        at += synclog_length_wgmma_smem_smem;
        synclog_print_wgmma_desc("desc_a", synclog_buf[at-4], synclog_buf[at-3], " ");
        synclog_print_wgmma_desc("desc_b", synclog_buf[at-2], synclog_buf[at-1], "");
        printf("\n");
        continue;
      }
    }
    if constexpr (synclog_enable_cpasync_barrier_arrive) {
      if (header == synclog_header_cpasync_barrier_arrive) {
        synclog_print_prefix("cpasync_barrier_arrive", at);
        at += synclog_length_cpasync_barrier_arrive;
        printf("smem_addr=%u\n", synclog_buf[at-1]);
        continue;
      }
    }
    // Unknown header word: the log is corrupt; trap into the debugger.
    asm volatile ("brkpt;\n" ::);
  }
  // Slot 0 is the allocation cursor; exceeding capacity means later events
  // were dropped rather than written.
  if (synclog_buf[0] >= synclog_cap) {
    printf(
      "synclog was truncated (exceeded capacity of %lu bytes)\n",
      (synclog_cap - 1) * sizeof(uint32_t)
    );
  }
  printf("synclog end\n");
  #endif
  #endif // defined(CUTLASS_ENABLE_SYNCLOG)
}

////////////////////////////////////////////////////////////////////////////////////////////////////

// Instrumentation shims: when synclog is enabled, calls to __syncthreads() /
// __syncwarp() are wrapped so each use logs its source line before issuing the
// real intrinsic. The intrinsic call inside the macro body is safe because the
// preprocessor does not re-expand a macro within its own expansion.
#if defined(CUTLASS_ENABLE_SYNCLOG)
#undef __syncthreads
#define __syncthreads() do {\
  cutlass::arch::synclog_emit_syncthreads(__LINE__);\
  __syncthreads();\
} while (0)
#endif // defined(CUTLASS_ENABLE_SYNCLOG)

#if defined(CUTLASS_ENABLE_SYNCLOG)
#undef __syncwarp
#define __syncwarp(...) do {\
  cutlass::arch::synclog_emit_syncwarp(__LINE__);\
  __syncwarp(__VA_ARGS__);\
} while (0)
#endif // defined(CUTLASS_ENABLE_SYNCLOG)


////////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace arch
} // namespace cutlass
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/wmma.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/wmma.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d4861ab682aca73d40ad5d0f298f9a265f7b9f2
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/wmma.h
@@ -0,0 +1,218 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Templates exposing architecture support for warp matrix multiply-add (WMMA) operations +*/ + +#pragma once + +#if (__CUDACC_VER_MAJOR__ >= 9) +#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)) +#define CUTLASS_ARCH_WMMA_ENABLED +#define CUTLASS_ARCH_WMMA_SM70_ENABLED +#endif +#endif + +#if (__CUDACC_VER_MAJOR__ >= 10) +#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 720)) +#define CUTLASS_ARCH_INTEGER_MATRIX_MULTIPLY_ENABLED +#define CUTLASS_ARCH_WMMA_SM72_ENABLED +#endif +#endif + +#if (__CUDACC_VER_MAJOR__ >= 10) +#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 750)) +#define CUTLASS_SUBBYTE_INTEGER_MATRIX_MULTIPLY_ENABLED +#define CUTLASS_ARCH_WMMA_SM75_ENABLED +#endif +#endif + +#if defined(CUTLASS_ARCH_WMMA_ENABLED) + +#include +#include "cutlass/arch/mma.h" +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/gemm/gemm.h" + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace arch { + +//////////////////////////////////////////////////////////////////////////////////////////////// +/// Statically maps cutlass data types => nvcuda::wmma data types +///////////////////////////////////////////////////////////////////////////////////////////////// +template +struct CutlassToWmmaDataType{ + using Type = Type_; +}; + +/// Statically maps cutlass::half_t => __half +template<> +struct CutlassToWmmaDataType { + using Type = __half; +}; + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDACC_VER_MAJOR__ >= 11) +template<> +struct CutlassToWmmaDataType { + using Type = __nv_bfloat16; +}; +#endif + +/// Statically maps int8_t => char +template<> +struct CutlassToWmmaDataType { + using Type = signed char; +}; + +/// Statically maps uint8_t => char +template<> +struct CutlassToWmmaDataType { + using Type = unsigned char; +}; + +/// Statically maps int32_t => int +template<> +struct CutlassToWmmaDataType { + using Type = int; 
+}; + +#if defined(CUTLASS_SUBBYTE_INTEGER_MATRIX_MULTIPLY_ENABLED) +/// Statically maps cutlass::int4b_t => experimental::precision::s4 +template<> +struct CutlassToWmmaDataType { + using Type = nvcuda::wmma::experimental::precision::s4; +}; + +/// Statically maps cutlass::uint4b_t => experimental::precision::s4 +template<> +struct CutlassToWmmaDataType { + using Type = nvcuda::wmma::experimental::precision::u4; +}; + +/// Statically maps cutlass::uint1b_t => experimental::precision::b1 +template<> +struct CutlassToWmmaDataType { + using Type = nvcuda::wmma::experimental::precision::b1; +}; +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////// +/// Statically maps cutlass::layout => nvcuda::wmma layout tags +//////////////////////////////////////////////////////////////////////////////////////////////// +template +struct CutlassToWmmaLayout { +}; + +/// Statically maps cutlass::layout::RowMajor => nvcuda::wmma::row_major layout tags +template <> +struct CutlassToWmmaLayout { + using Layout = nvcuda::wmma::row_major; + static nvcuda::wmma::layout_t const value = nvcuda::wmma::layout_t::mem_row_major; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////// +/// Statically maps cutlass::layout::RowMajor => nvcuda::wmma::row_major layout tags +//////////////////////////////////////////////////////////////////////////////////////////////// +template <> +struct CutlassToWmmaLayout { + using Layout = nvcuda::wmma::col_major; + static nvcuda::wmma::layout_t const value = nvcuda::wmma::layout_t::mem_col_major; +}; +//////////////////////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////////////////////// +/// Statically maps nvcuda::wmma data types => cutlass data types +///////////////////////////////////////////////////////////////////////////////////////////////// 
+template +struct WmmaToCutlassDataType{ + using Type = Type_; +}; + +/// Statically maps __half => cutlass::half_t +template<> +struct WmmaToCutlassDataType<__half> { + using Type = cutlass::half_t; +}; + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDACC_VER_MAJOR__ >= 11) +template<> +struct WmmaToCutlassDataType<__nv_bfloat16> { + using Type = cutlass::bfloat16_t; +}; +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +// WMMA template structure defines nvcuda::wmma::fragments and static assertion chaeks +// for a specific template parameterized data type (Element[A|B|C]), layout (Layout[A|B|C]), +// and native wmma size (Shape) +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + typename Shape_, ///< Size of the matrix product (concept: GemmShape) + typename ElementA_, ///< Data type of A elements + typename LayoutA_, ///< Layout of A matrix (concept: MatrixLayout) + typename ElementB_, ///< Data type of B elements + typename LayoutB_, ///< Layout of B matrix (concept: MatrixLayout) + typename ElementC_, ///< Element type of C matrix + typename LayoutC_, /// Layout of C matrix (concept: MatrixLayout) + typename Operator_ = cutlass::arch::OpMultiplyAdd ///< Inner product operator (multiply-add, xor.popc) +> +struct Wmma; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace arch +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// +// Specializations for each compute capability +// +#ifdef CUTLASS_ARCH_WMMA_SM70_ENABLED +#include "cutlass/arch/wmma_sm70.h" +#endif + +#ifdef CUTLASS_ARCH_WMMA_SM72_ENABLED +#include "cutlass/arch/wmma_sm72.h" +#endif + +#ifdef CUTLASS_ARCH_WMMA_SM75_ENABLED 
+#include "cutlass/arch/wmma_sm75.h" +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif //CUTLASS_ARCH_WMMA_ENABLED diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/wmma_sm70.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/wmma_sm70.h new file mode 100644 index 0000000000000000000000000000000000000000..2c540be88577b448a2abc75cf6478736a41eb716 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/wmma_sm70.h @@ -0,0 +1,132 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Matrix multiply +*/ + +#pragma once +#include "cutlass/cutlass.h" +#include CUDA_STD_HEADER(cassert) +#include "cutlass/layout/matrix.h" + +//////////////////////////////////////////////////////////////////////////////// +namespace cutlass { +namespace arch { + + +//////////////////////////////////////////////////////////////////////////////// +// +// WMMA template structure defines nvcuda::wmma::fragments and static assert for +// wmma native instruction sizes supported for half +// +//////////////////////////////////////////////////////////////////////////////// +template < +typename Shape_, +typename LayoutA_, +typename LayoutB_, +typename ElementC_, +typename LayoutC_> +struct Wmma< + Shape_, ///< Size of the matrix product (concept: GemmShape) + cutlass::half_t, ///< ElementA + LayoutA_, ///< LayoutA + cutlass::half_t, ///< ElementB + LayoutB_, ///< LayoutB + ElementC_, ///< ElementC + LayoutC_, ///< LayoutC + cutlass::arch::OpMultiplyAdd ///< Operator (multiply-add, xor.popc) +> { + +#if defined(CUTLASS_ARCH_WMMA_SM70_ENABLED) + using Shape = Shape_; + using ElementA = cutlass::half_t; + using LayoutA = LayoutA_; + using ElementB = cutlass::half_t; + using LayoutB = LayoutB_; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + using Operator = cutlass::arch::OpMultiplyAdd; + using ArchTag = 
arch::Sm70; + + // check supported wmma shape for the given multiplicand data types + static_assert( + platform::is_same, Shape>::value || + platform::is_same, Shape>::value || + platform::is_same, Shape>::value, + "Supported list of wmma operator shape for f16 multiplicands are: 16x16x16, 8x32x16, and 32x8x16"); + + // check supported wmma output data type for the given multiplicand data types + static_assert( + platform::is_same::value || platform::is_same::value, + "Supported of wmma output data type for f16 multiplicands are: f16 and f32"); + + // Wmma Fragment + using FragmentA = nvcuda::wmma::fragment< + nvcuda::wmma::matrix_a, + Shape::kM, + Shape::kN, + Shape::kK, + typename CutlassToWmmaDataType::Type, + typename CutlassToWmmaLayout::Layout>; + + using FragmentB = nvcuda::wmma::fragment< + nvcuda::wmma::matrix_b, + Shape::kM, + Shape::kN, + Shape::kK, + typename CutlassToWmmaDataType::Type, + typename CutlassToWmmaLayout::Layout>; + + using FragmentC = nvcuda::wmma::fragment< + nvcuda::wmma::accumulator, + Shape::kM, + Shape::kN, + Shape::kK, + typename CutlassToWmmaDataType::Type>; + + /// Performs a nvcuda::wmma matrix multiply-accumulate operation + CUTLASS_DEVICE + void operator()( + FragmentC &D, + FragmentA const &A, + FragmentB const &B, + FragmentC const &C) const { + + nvcuda::wmma::mma_sync(D, A, B, C); + } +#else + static_assert(false, "wmma.mma.sync for floating point multiplicands is available only for SM70 and beyond"); +#endif + +}; + +} // namespace arch +} // namespace cutlass diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/wmma_sm72.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/wmma_sm72.h new file mode 100644 index 0000000000000000000000000000000000000000..1eb553e8f311e66e08e47dab15c6b08c29dec81c --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/wmma_sm72.h @@ -0,0 +1,206 @@ 
+/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Matrix multiply +*/ + +#pragma once +#include "cutlass/cutlass.h" +#include CUDA_STD_HEADER(cassert) +#include "cutlass/layout/matrix.h" + +//////////////////////////////////////////////////////////////////////////////// +namespace cutlass { +namespace arch { + +//////////////////////////////////////////////////////////////////////////////// +// +// WMMA template structure defines nvcuda::wmma::fragments and static assert for +// wmma native instruction sizes supported for int8_t +// +//////////////////////////////////////////////////////////////////////////////// +template < +typename Shape_, +typename LayoutA_, +typename LayoutB_, +typename LayoutC_> +struct Wmma< + Shape_, ///< Size of the matrix product (concept: GemmShape) + int8_t, ///< ElementA + LayoutA_, ///< LayoutA + int8_t, ///< ElementB + LayoutB_, ///< LayoutB + int32_t, ///< ElementC + LayoutC_, ///< LayoutC + cutlass::arch::OpMultiplyAdd ///< Operator (multiply-add, xor.popc) +> { +#if defined(CUTLASS_ARCH_WMMA_SM72_ENABLED) + using Shape = Shape_; + using ElementA = int8_t; + using LayoutA = LayoutA_; + using ElementB = int8_t; + using LayoutB = LayoutB_; + using ElementC = int32_t; + using LayoutC = LayoutC_; + using Operator = cutlass::arch::OpMultiplyAdd; + using ArchTag = arch::Sm72; + + // check supported wmma shape for the given multiplicand data types + static_assert( + platform::is_same, Shape>::value || + platform::is_same, Shape>::value || + platform::is_same, Shape>::value, + "Supported list of wmma operator shape for s8 multiplicands are: 16x16x16, 8x32x16, and 32x8x16"); + + + // Wmma Fragment + using FragmentA = nvcuda::wmma::fragment< + nvcuda::wmma::matrix_a, + Shape::kM, + Shape::kN, + Shape::kK, + typename CutlassToWmmaDataType::Type, + typename CutlassToWmmaLayout::Layout>; + + using FragmentB = nvcuda::wmma::fragment< + nvcuda::wmma::matrix_b, + Shape::kM, + Shape::kN, + Shape::kK, + typename CutlassToWmmaDataType::Type, + typename CutlassToWmmaLayout::Layout>; + 
+ using FragmentC = nvcuda::wmma::fragment< + nvcuda::wmma::accumulator, + Shape::kM, + Shape::kN, + Shape::kK, + typename CutlassToWmmaDataType::Type>; + + /// Performs a nvcuda::wmma matrix multiply-accumulate operation + CUTLASS_DEVICE + void operator()( + FragmentC &D, + FragmentA const &A, + FragmentB const &B, + FragmentC const &C) const { + + nvcuda::wmma::mma_sync(D, A, B, C); + } + +#else + static_assert(false, "wmma.mma.sync integer type multiplicands is available only for SM72 and beyond"); +#endif + +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// WMMA template structure defines nvcuda::wmma::fragments and static assert for +// wmma native instruction sizes supported for uint8_t +// +//////////////////////////////////////////////////////////////////////////////// +template < +typename Shape_, +typename LayoutA_, +typename LayoutB_, +typename LayoutC_> +struct Wmma< + Shape_, ///< Size of the matrix product (concept: GemmShape) + uint8_t, ///< ElementA + LayoutA_, ///< LayoutA + uint8_t, ///< ElementB + LayoutB_, ///< LayoutB + int32_t, ///< ElementC + LayoutC_, ///< LayoutC + cutlass::arch::OpMultiplyAdd ///< Operator (multiply-add, xor.popc) +> { +#if defined(CUTLASS_ARCH_WMMA_SM72_ENABLED) + using Shape = Shape_; + using ElementA = uint8_t; + using LayoutA = LayoutA_; + using ElementB = uint8_t; + using LayoutB = LayoutB_; + using ElementC = int32_t; + using LayoutC = LayoutC_; + using Operator = cutlass::arch::OpMultiplyAdd; + using ArchTag = arch::Sm72; + + // check supported wmma shape for the given multiplicand data types + static_assert( + platform::is_same, Shape>::value || + platform::is_same, Shape>::value || + platform::is_same, Shape>::value, + "Supported list of wmma operator shape for u8 multiplicands are: 16x16x16, 8x32x16, and 32x8x16"); + + // Wmma Fragment + using FragmentA = nvcuda::wmma::fragment< + nvcuda::wmma::matrix_a, + Shape::kM, + Shape::kN, + Shape::kK, + typename 
CutlassToWmmaDataType::Type, + typename CutlassToWmmaLayout::Layout>; + + using FragmentB = nvcuda::wmma::fragment< + nvcuda::wmma::matrix_b, + Shape::kM, + Shape::kN, + Shape::kK, + typename CutlassToWmmaDataType::Type, + typename CutlassToWmmaLayout::Layout>; + + using FragmentC = nvcuda::wmma::fragment< + nvcuda::wmma::accumulator, + Shape::kM, + Shape::kN, + Shape::kK, + typename CutlassToWmmaDataType::Type>; + + /// Performs a nvcuda::wmma matrix multiply-accumulate operation + CUTLASS_DEVICE + void operator()( + FragmentC &D, + FragmentA const &A, + FragmentB const &B, + FragmentC const &C) const { + + nvcuda::wmma::mma_sync(D, A, B, C); + } + +#else + static_assert(false, "wmma.mma.sync integer type multiplicands is available only for SM72 and beyond"); +#endif + +}; + +} // namespace arch +} // namespace cutlass diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/wmma_sm75.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/wmma_sm75.h new file mode 100644 index 0000000000000000000000000000000000000000..c3535ef0748e53b204b7d20cdd4aa82edc8c72a8 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/arch/wmma_sm75.h @@ -0,0 +1,203 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Matrix multiply +*/ + +#pragma once +#include "cutlass/cutlass.h" +#include CUDA_STD_HEADER(cassert) +#include "cutlass/layout/matrix.h" + +//////////////////////////////////////////////////////////////////////////////// +namespace cutlass { +namespace arch { + +//////////////////////////////////////////////////////////////////////////////// +// +// WMMA template structure defines nvcuda::wmma::fragments and static assert for +// wmma native instruction sizes supported for cutlass::int4b_t (experimental::s4). 
+// +//////////////////////////////////////////////////////////////////////////////// +template < +typename Shape_, +typename LayoutA_, +typename LayoutB_, +typename LayoutC_> +struct Wmma< + Shape_, ///< Size of the matrix product (concept: GemmShape) + cutlass::int4b_t, ///< ElementA + LayoutA_, ///< LayoutA + cutlass::int4b_t, ///< ElementB + LayoutB_, ///< LayoutB + int32_t, ///< ElementC + LayoutC_, ///< LayoutC + cutlass::arch::OpMultiplyAdd ///< Operator (multiply-add, xor.popc) +> { +#if defined(CUTLASS_ARCH_WMMA_SM75_ENABLED) + using Shape = Shape_; + using ElementA = cutlass::int4b_t; + using LayoutA = LayoutA_; + using ElementB = cutlass::int4b_t; + using LayoutB = LayoutB_; + using ElementC = int32_t; + using LayoutC = LayoutC_; + using Operator = cutlass::arch::OpMultiplyAdd; + using ArchTag = arch::Sm75; + + // check supported wmma shape for the given multiplicand data types + static_assert( + platform::is_same, Shape>::value, + "Supported list of wmma operator shape for s8 multiplicands is: 8x8x32"); + + + // Wmma Fragment + using FragmentA = nvcuda::wmma::fragment< + nvcuda::wmma::matrix_a, + Shape::kM, + Shape::kN, + Shape::kK, + typename CutlassToWmmaDataType::Type, + typename CutlassToWmmaLayout::Layout>; + + using FragmentB = nvcuda::wmma::fragment< + nvcuda::wmma::matrix_b, + Shape::kM, + Shape::kN, + Shape::kK, + typename CutlassToWmmaDataType::Type, + typename CutlassToWmmaLayout::Layout>; + + using FragmentC = nvcuda::wmma::fragment< + nvcuda::wmma::accumulator, + Shape::kM, + Shape::kN, + Shape::kK, + typename CutlassToWmmaDataType::Type>; + + /// Performs a nvcuda::wmma matrix multiply-accumulate operation + CUTLASS_DEVICE + void operator()( + FragmentC &D, + FragmentA const &A, + FragmentB const &B, + FragmentC const &C) const { + nvcuda::wmma::mma_sync(D, A, B, C); + + } + +#else + static_assert(false, "wmma.mma.sync integer type multiplicands is available only for SM75 and beyond"); +#endif + +}; + 
+//////////////////////////////////////////////////////////////////////////////// +// +// WMMA template structure defines nvcuda::wmma::fragments and static assert for +// wmma native instruction sizes supported for cutlass::uint1b_t (experimental::b1). +// +//////////////////////////////////////////////////////////////////////////////// +template < +typename Shape_, +typename LayoutA_, +typename LayoutB_, +typename LayoutC_> +struct Wmma< + Shape_, ///< Size of the matrix product (concept: GemmShape) + cutlass::uint1b_t, ///< ElementA + LayoutA_, ///< LayoutA + cutlass::uint1b_t, ///< ElementB + LayoutB_, ///< LayoutB + int32_t, ///< ElementC + LayoutC_, ///< LayoutC + cutlass::arch::OpXorPopc ///< Operator (multiply-add, xor.popc) +> { +#if defined(CUTLASS_ARCH_WMMA_SM75_ENABLED) + using Shape = Shape_; + using ElementA = cutlass::uint1b_t; + using LayoutA = LayoutA_; + using ElementB = cutlass::uint1b_t; + using LayoutB = LayoutB_; + using ElementC = int32_t; + using LayoutC = LayoutC_; + using Operator = cutlass::arch::OpXorPopc; + using ArchTag = arch::Sm75; + + // check supported wmma shape for the given multiplicand data types + static_assert( + platform::is_same, Shape>::value, + "Supported list of wmma operator shape for b1 multiplicands is: 8x8x128"); + + + // Wmma Fragment + using FragmentA = nvcuda::wmma::fragment< + nvcuda::wmma::matrix_a, + Shape::kM, + Shape::kN, + Shape::kK, + typename CutlassToWmmaDataType::Type, + typename CutlassToWmmaLayout::Layout>; + + using FragmentB = nvcuda::wmma::fragment< + nvcuda::wmma::matrix_b, + Shape::kM, + Shape::kN, + Shape::kK, + typename CutlassToWmmaDataType::Type, + typename CutlassToWmmaLayout::Layout>; + + using FragmentC = nvcuda::wmma::fragment< + nvcuda::wmma::accumulator, + Shape::kM, + Shape::kN, + Shape::kK, + typename CutlassToWmmaDataType::Type>; + + /// Performs a nvcuda::wmma matrix multiply-accumulate operation + CUTLASS_DEVICE + void operator()( + FragmentC &D, + FragmentA const &A, + FragmentB 
const &B, + FragmentC const &C) const { + nvcuda::wmma::bmma_sync(D, A, B, C, nvcuda::wmma::experimental::bmmaBitOpXOR, + nvcuda::wmma::experimental::bmmaAccumulateOpPOPC); + } + +#else + static_assert(false, "wmma.mma.sync integer type multiplicands is available only for SM75 and beyond"); +#endif + +}; + +} // namespace arch +} // namespace cutlass diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/array.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/array.h new file mode 100644 index 0000000000000000000000000000000000000000..ce33110aa4f44e7deba56a5f9fe4db206a6889ce --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/array.h @@ -0,0 +1,2860 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Statically sized array of elements that accommodates all CUTLASS-supported numeric types + and is safe to use in a union. 
+*/ + +#pragma once +#include "cutlass/cutlass.h" +#include "cutlass/functional.h" +#include "cutlass/numeric_types.h" +#include "cutlass/platform/platform.h" +namespace cutlass { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Statically sized array for any data type +template < + typename T, + int N, + bool RegisterSized = sizeof_bits::value >= 32 +> +struct Array; + +namespace detail { + +template +struct is_Array : platform::false_type {}; + +template < + typename T, + int N, + bool RegisterSized +> +struct is_Array > : platform::true_type {}; + +template +constexpr bool is_Array_v = is_Array::value; + +} // namespace detail + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines the size of an Array<> in bits +template +struct sizeof_bits > { + static constexpr int value = sizeof(Array) * 8; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Returns true if the argument is a power of 2 +CUTLASS_HOST_DEVICE +constexpr bool ispow2(unsigned x) { + return x && (!(x & (x - 1))); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Returns the largest power of two not greater than the argument. +CUTLASS_HOST_DEVICE +constexpr unsigned floor_pow_2(unsigned x) { + return (x == 0 || ispow2(x)) ? 
x : ((floor_pow_2(x >> 1)) << 1); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Statically sized array for any data type +template < + typename T, + int N +> +struct Array { + + /// Storage type + using Storage = T; + + /// Element type + using Element = T; + + /// Number of storage elements + //static std::size_t const kStorageElements = N; + static constexpr size_t kStorageElements = N; + + /// Number of logical elements + static constexpr size_t kElements = N; + + // + // C++ standard members + // + + typedef T value_type; + typedef size_t size_type; + typedef ptrdiff_t difference_type; + typedef value_type &reference; + typedef value_type const & const_reference; + typedef value_type *pointer; + typedef value_type const * const_pointer; + + // + // Iterators + // + + /// Bidirectional iterator over elements + class iterator { + + /// Pointer to object + T *ptr_; + + public: + + CUTLASS_HOST_DEVICE + iterator(): ptr_(nullptr) { } + + CUTLASS_HOST_DEVICE + iterator(T *_ptr): ptr_(_ptr) { } + + CUTLASS_HOST_DEVICE + iterator &operator++() { + ++ptr_; + return *this; + } + + CUTLASS_HOST_DEVICE + iterator &operator--() { + --ptr_; + return *this; + } + + CUTLASS_HOST_DEVICE + iterator operator++(int) { + iterator ret(*this); + ++ptr_; + return ret; + } + + CUTLASS_HOST_DEVICE + iterator operator--(int) { + iterator ret(*this); + --ptr_; + return ret; + } + + CUTLASS_HOST_DEVICE + T &operator*() const { + return *ptr_; + } + + CUTLASS_HOST_DEVICE + bool operator==(iterator const &other) const { + return ptr_ == other.ptr_; + } + + CUTLASS_HOST_DEVICE + bool operator!=(iterator const &other) const { + return ptr_ != other.ptr_; + } + }; + + /// Bidirectional constant iterator over elements + class const_iterator { + + /// Pointer to object + const T *ptr_; + + public: + + CUTLASS_HOST_DEVICE + const_iterator(): ptr_(nullptr) { } + + CUTLASS_HOST_DEVICE + const_iterator(T const *_ptr): ptr_(_ptr) { } 
+ + CUTLASS_HOST_DEVICE + const_iterator &operator++() { + ++ptr_; + return *this; + } + + CUTLASS_HOST_DEVICE + const_iterator &operator--() { + --ptr_; + return *this; + } + + CUTLASS_HOST_DEVICE + const_iterator operator++(int) { + const_iterator ret(*this); + ++ptr_; + return ret; + } + + CUTLASS_HOST_DEVICE + const_iterator operator--(int) { + const_iterator ret(*this); + --ptr_; + return ret; + } + + CUTLASS_HOST_DEVICE + T const &operator*() const { + return *ptr_; + } + + CUTLASS_HOST_DEVICE + bool operator==(const_iterator const &other) const { + return ptr_ == other.ptr_; + } + + CUTLASS_HOST_DEVICE + bool operator!=(const_iterator const &other) const { + return ptr_ != other.ptr_; + } + }; + + /// Bidirectional iterator over elements + class reverse_iterator { + + /// Pointer to object + T *ptr_; + + public: + + CUTLASS_HOST_DEVICE + reverse_iterator(): ptr_(nullptr) { } + + CUTLASS_HOST_DEVICE + reverse_iterator(T *_ptr): ptr_(_ptr) { } + + CUTLASS_HOST_DEVICE + reverse_iterator &operator++() { + --ptr_; + return *this; + } + + CUTLASS_HOST_DEVICE + reverse_iterator &operator--() { + ++ptr_; + return *this; + } + + CUTLASS_HOST_DEVICE + reverse_iterator operator++(int) { + iterator ret(*this); + --ptr_; + return ret; + } + + CUTLASS_HOST_DEVICE + reverse_iterator operator--(int) { + iterator ret(*this); + ++ptr_; + return ret; + } + + CUTLASS_HOST_DEVICE + T &operator*() const { + return *(ptr_ - 1); + } + + CUTLASS_HOST_DEVICE + bool operator==(reverse_iterator const &other) const { + return ptr_ == other.ptr_; + } + + CUTLASS_HOST_DEVICE + bool operator!=(reverse_iterator const &other) const { + return ptr_ != other.ptr_; + } + }; + + /// Bidirectional constant iterator over elements + class const_reverse_iterator { + + /// Pointer to object + T const *ptr_; + + public: + + CUTLASS_HOST_DEVICE + const_reverse_iterator(): ptr_(nullptr) { } + + CUTLASS_HOST_DEVICE + const_reverse_iterator(T const *_ptr): ptr_(_ptr) { } + + CUTLASS_HOST_DEVICE + 
const_reverse_iterator &operator++() { + --ptr_; + return *this; + } + + CUTLASS_HOST_DEVICE + const_reverse_iterator &operator--() { + ++ptr_; + return *this; + } + + CUTLASS_HOST_DEVICE + const_reverse_iterator operator++(int) { + const_reverse_iterator ret(*this); + --ptr_; + return ret; + } + + CUTLASS_HOST_DEVICE + const_reverse_iterator operator--(int) { + const_reverse_iterator ret(*this); + ++ptr_; + return ret; + } + + CUTLASS_HOST_DEVICE + T const &operator*() const { + return *(ptr_ - 1); + } + + CUTLASS_HOST_DEVICE + bool operator==(const_iterator const &other) const { + return ptr_ == other.ptr_; + } + + CUTLASS_HOST_DEVICE + bool operator!=(const_iterator const &other) const { + return ptr_ != other.ptr_; + } + }; + + /// Internal storage + Storage storage[kElements]; + + /// Efficient clear method + CUTLASS_HOST_DEVICE + void clear() { + fill(T(0)); + } + + CUTLASS_HOST_DEVICE + reference at(size_type pos) { + return reinterpret_cast(storage[pos]); + } + + CUTLASS_HOST_DEVICE + const_reference at(size_type pos) const { + return reinterpret_cast(storage[pos]); + } + + CUTLASS_HOST_DEVICE + reference operator[](size_type pos) { + return reinterpret_cast(storage[pos]); + } + + CUTLASS_HOST_DEVICE + const_reference operator[](size_type pos) const { + return reinterpret_cast(storage[pos]); + } + + CUTLASS_HOST_DEVICE + reference front() { + return reinterpret_cast(storage[0]); + } + + CUTLASS_HOST_DEVICE + const_reference front() const { + return reinterpret_cast(storage[0]); + } + + CUTLASS_HOST_DEVICE + reference back() { + return reinterpret_cast(storage[kStorageElements - 1]); + } + + CUTLASS_HOST_DEVICE + const_reference back() const { + return reinterpret_cast(storage[kStorageElements - 1]); + } + + CUTLASS_HOST_DEVICE + pointer data() { + return reinterpret_cast(storage); + } + + CUTLASS_HOST_DEVICE + const_pointer data() const { + return reinterpret_cast(storage); + } + + CUTLASS_HOST_DEVICE + pointer raw_data() { + return 
reinterpret_cast(storage); + } + + CUTLASS_HOST_DEVICE + const_pointer raw_data() const { + return reinterpret_cast(storage); + } + + + CUTLASS_HOST_DEVICE + constexpr bool empty() const { + return !kElements; + } + + CUTLASS_HOST_DEVICE + constexpr size_type size() const { + return kElements; + } + + CUTLASS_HOST_DEVICE + constexpr size_type max_size() const { + return kElements; + } + + CUTLASS_HOST_DEVICE + void fill(T const &value) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < int(kElements); ++i) { + storage[i] = static_cast(value); + } + } + + CUTLASS_HOST_DEVICE + iterator begin() { + return iterator(storage); + } + + CUTLASS_HOST_DEVICE + const_iterator begin() const { + return cbegin(); + } + + CUTLASS_HOST_DEVICE + const_iterator cbegin() const { + return const_iterator(storage); + } + + CUTLASS_HOST_DEVICE + iterator end() { + return iterator(reinterpret_cast(storage + kStorageElements)); + } + + CUTLASS_HOST_DEVICE + const_iterator end() const { + return cend(); + } + + CUTLASS_HOST_DEVICE + const_iterator cend() const { + return const_iterator(reinterpret_cast(storage + kStorageElements)); + } + + CUTLASS_HOST_DEVICE + reverse_iterator rbegin() { + return reverse_iterator(reinterpret_cast(storage + kStorageElements)); + } + + CUTLASS_HOST_DEVICE + const_reverse_iterator rbegin() const { + return crbegin(); + } + + CUTLASS_HOST_DEVICE + const_reverse_iterator crbegin() const { + return const_reverse_iterator(reinterpret_cast(storage + kStorageElements)); + } + + CUTLASS_HOST_DEVICE + reverse_iterator rend() { + return reverse_iterator(reinterpret_cast(storage)); + } + + CUTLASS_HOST_DEVICE + const_reverse_iterator rend() const { + return crend(); + } + + CUTLASS_HOST_DEVICE + const_reverse_iterator crend() const { + return const_reverse_iterator(reinterpret_cast(storage)); + } + + // + // Comparison operators + // + +}; + + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Factories 
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +CUTLASS_HOST_DEVICE +Array make_Array(Element x) { + return {x}; +} + +template +CUTLASS_HOST_DEVICE +Array make_Array(Element x, Element y) { + return {x,y}; +} + +template +CUTLASS_HOST_DEVICE +Array make_Array(Element x, Element y, Element z) { + return {x,y,z}; +} + +template +CUTLASS_HOST_DEVICE +Array make_Array(Element x, Element y, Element z, Element w) { + return {x,y,z,w}; +} + + +///////////////////////////////////////////////////////////////////////////////////////////////// +// functional.h numeric specializations +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct absolute_value_op< Array > { + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs) const { + + Array result; + absolute_value_op scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i]); + } + + return result; + } +}; + +template +struct plus> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, Array const &rhs) const { + + Array result; + plus scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i], rhs[i]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, T const &scalar) const { + + Array result; + plus scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i], scalar); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( T const &scalar, Array const &rhs) const { + + Array result; + plus scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(scalar, rhs[i]); + } + + return result; + } +}; +template +struct minus> { + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, Array const &rhs) const { + + Array result; + minus scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int 
i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i], rhs[i]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, T const &scalar) const { + + Array result; + minus scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i], scalar); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( T const &scalar, Array const &rhs) const { + + Array result; + minus scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(scalar, rhs[i]); + } + + return result; + } +}; + +template +struct multiplies> { + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, Array const &rhs) const { + + Array result; + multiplies scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i], rhs[i]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, T const &scalar) const { + + Array result; + multiplies scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i], scalar); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( T const &scalar, Array const &rhs) const { + + Array result; + multiplies scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(scalar, rhs[i]); + } + + return result; + } +}; + +template +struct maximum_absolute_value_reduction, PropogateNaN> { + + CUTLASS_HOST_DEVICE + T operator() (T const& scalar, Array const& rhs) const { + + T result = scalar; + maximum_absolute_value_reduction scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result = scalar_op(result, rhs[i]); + } + + return result; + } +}; + +template +struct scale> { + T const scaling_factor_; + + CUTLASS_HOST_DEVICE + scale(T scaling_factor) : scaling_factor_(scaling_factor) { + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const & rhs) const { + Array result; + + 
CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = rhs[i] * scaling_factor_; + } + + return result; + } +}; + +template +struct divides> { + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, Array const &rhs) const { + + Array result; + divides scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i], rhs[i]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, T const &scalar) const { + + Array result; + divides scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i], scalar); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( T const &scalar, Array const &rhs) const { + + Array result; + divides scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(scalar, rhs[i]); + } + + return result; + } +}; + +template +struct reciprocal_approximate> { + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs) const { + + Array result; + reciprocal_approximate scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i]); + } + + return result; + } +}; + +template +struct reciprocal_approximate_ftz> { + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs) const { + + Array result; + reciprocal_approximate_ftz scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i]); + } + + return result; + } +}; + +template +struct maximum, PropagateNaN> { + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, Array const &rhs) const { + + Array result; + maximum scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i], rhs[i]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, T const &scalar) const { + + Array result; + maximum scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = 
scalar_op(lhs[i], scalar); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(T const &scalar, Array const &rhs) const { + + Array result; + maximum scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(scalar, rhs[i]); + } + + return result; + } +}; + +template +struct minimum, PropagateNaN> { + + CUTLASS_HOST_DEVICE + static T scalar_op(T const &lhs, T const &rhs) { + return (rhs < lhs ? rhs : lhs); + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, Array const &rhs) const { + + Array result; + minimum scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i], rhs[i]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, T const &scalar) const { + + Array result; + minimum scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i], scalar); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(T const &scalar, Array const &rhs) const { + + Array result; + minimum scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(scalar, rhs[i]); + } + + return result; + } +}; + +template +struct minimum_with_nan_propagation> : minimum, true> +{}; + +template +struct negate> { + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs) const { + + Array result; + negate scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(lhs[i]); + } + + return result; + } +}; + +/// Fused multiply-add +template +struct multiply_add, Array, Array> { + + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, Array const &b, Array const &c) const { + + Array result; + multiply_add scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(a[i], b[i], c[i]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, T const &scalar, Array const &c) const 
{ + + Array result; + multiply_add scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(a[i], scalar, c[i]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(T const &scalar, Array const &b, Array const &c) const { + + Array result; + multiply_add scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(scalar, b[i], c[i]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, Array const &b, T const &scalar) const { + + Array result; + multiply_add scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(a[i], b[i], scalar); + } + + return result; + } + + + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, T const &scalar_b, T const &scalar_c) const { + + Array result; + multiply_add scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(a[i], scalar_b, scalar_c); + } + + return result; + } +}; + +/// Fused square-and-plus +template +struct square_and_plus> { + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, Array const &rhs) const { + multiply_add, Array, Array> ma_op; + return ma_op(rhs, rhs, lhs); + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const &lhs, T const &rhs) const { + plus> plus_op; + multiplies multiplies_op; + return plus_op(multiplies_op(rhs, rhs), lhs); + } +}; + +/// Inverse-square-root +template +struct inverse_square_root> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a) const { + Array result; + inverse_square_root scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(a[i]); + } + return result; + } +}; + +template +struct inverse_square_root> { + CUTLASS_HOST_DEVICE + Array operator()(Array const & a) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *a_ptr = 
reinterpret_cast<__half2 const *>(&a); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = h2rsqrt(a_ptr[i]); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a); + __half d_residual = hrsqrt(a_residual_ptr[N - 1]); + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + inverse_square_root scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(a[i]); + } + + #endif + + return result; + } +}; + +/// Fused multiply-add-relu0 +template +struct multiply_add_relu0, Array, Array> { + + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, Array const &b, Array const &c) const { + + Array result; + multiply_add scalar_op; + maximum mx; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mx(scalar_op(a[i], b[i], c[i]), T(0)); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, T const &scalar, Array const &c) const { + + Array result; + multiply_add scalar_op; + maximum mx; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mx(scalar_op(a[i], scalar, c[i]), T(0)); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(T const &scalar, Array const &b, Array const &c) const { + + Array result; + multiply_add scalar_op; + maximum mx; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mx(scalar_op(scalar, b[i], c[i]), T(0)); + } + + return result; + } +}; + + +template +struct conjugate > { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a) const { + + conjugate conj_op; + + Array ca; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + ca[i] = conj_op(a[i]); + } + return ca; + } +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// +// functional.h numeric specializations targeting SIMD instructions in device code. 
+///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct plus> { + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs, Array const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs); + __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hadd2(lhs_ptr[i], rhs_ptr[i]); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs); + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs); + __half d_residual = __hadd(a_residual_ptr[N - 1], b_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs[i] + rhs[i]; + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(half_t const & lhs, Array const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs)); + __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hadd2(lhs_pair, rhs_ptr[i]); + } + + if constexpr (N % 2) { + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs); + __half d_residual = __hadd(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs + rhs[i]; + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs, half_t const &rhs) const { + Array 
result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs); + __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hadd2(lhs_ptr[i], rhs_pair); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs); + __half d_residual = __hadd(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs)); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs[i] + rhs; + } + #endif + + return result; + } +}; + +template +struct minus> { + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs, Array const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs); + __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hsub2(lhs_ptr[i], rhs_ptr[i]); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs); + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs); + __half d_residual = __hsub(a_residual_ptr[N - 1], b_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs[i] - rhs[i]; + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(half_t const & lhs, Array const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 lhs_pair = __half2half2(reinterpret_cast<__half const 
&>(lhs)); + __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hsub2(lhs_pair, rhs_ptr[i]); + } + + if constexpr (N % 2) { + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs); + __half d_residual = __hsub(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs - rhs[i]; + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs, half_t const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs); + __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hsub2(lhs_ptr[i], rhs_pair); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs); + __half d_residual = __hsub(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs)); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs[i] - rhs; + } + #endif + + return result; + } +}; + +template +struct multiplies> { + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs, Array const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs); + __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hmul2(lhs_ptr[i], rhs_ptr[i]); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr 
= reinterpret_cast<__half const *>(&lhs); + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs); + __half d_residual = __hmul(a_residual_ptr[N - 1], b_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs[i] * rhs[i]; + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(half_t const & lhs, Array const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs)); + __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hmul2(lhs_pair, rhs_ptr[i]); + } + + if constexpr (N % 2) { + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs); + + __half d_residual = __hmul( + reinterpret_cast<__half const &>(lhs), + b_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs * rhs[i]; + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs, half_t const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs); + __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hmul2(lhs_ptr[i], rhs_pair); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs); + + __half d_residual = __hmul( + a_residual_ptr[N - 1], + reinterpret_cast<__half const &>(rhs)); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + 
CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs[i] * rhs; + } + #endif + + return result; + } +}; + +template +struct divides> { + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs, Array const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs); + __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __h2div(lhs_ptr[i], rhs_ptr[i]); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs); + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs); + + __half d_residual = __hdiv( + a_residual_ptr[N - 1], + b_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs[i] / rhs[i]; + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(half_t const & lhs, Array const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs)); + __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __h2div(lhs_pair, rhs_ptr[i]); + } + + if constexpr (N % 2) { + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs); + + __half d_residual = __hdiv( + reinterpret_cast<__half const &>(lhs), + b_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs / rhs[i]; + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs, 
half_t const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs); + __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __h2div(lhs_ptr[i], rhs_pair); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs); + + __half d_residual = __hdiv( + a_residual_ptr[N - 1], + reinterpret_cast<__half const &>(rhs)); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = lhs[i] / rhs; + } + #endif + + return result; + } +}; + +template +struct negate> { + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *source_ptr = reinterpret_cast<__half2 const *>(&lhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hneg2(source_ptr[i]); + } + + if constexpr (N % 2) { + half_t x = -lhs[N - 1]; + __half lhs_val = reinterpret_cast<__half const &>(x); + result[N - 1] = reinterpret_cast(lhs_val); + } + + #else + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = -lhs[i]; + } + #endif + + return result; + } +}; + +/// Fused multiply-add +template +struct multiply_add, Array, Array> { + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + Array const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a); + __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b); + __half2 const *c_ptr = 
reinterpret_cast<__half2 const *>(&c); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hfma2(a_ptr[i], b_ptr[i], c_ptr[i]); + } + + if constexpr (N % 2) { + + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a); + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b); + __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c); + + __half d_residual = __hfma( + a_residual_ptr[N - 1], + b_residual_ptr[N - 1], + c_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a[i], b[i], c[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + half_t const &a, + Array const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 a_pair = __half2half2(reinterpret_cast<__half const &>(a)); + __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b); + __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hfma2(a_pair, b_ptr[i], c_ptr[i]); + } + + if constexpr (N % 2) { + + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b); + __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c); + __half d_residual = __hfma( + reinterpret_cast<__half const &>(a), + b_residual_ptr[N - 1], + c_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a, b[i], c[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + half_t const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = 
reinterpret_cast<__half2 *>(&result); + __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a); + __half2 b_pair = __half2half2(reinterpret_cast<__half const &>(b)); + __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hfma2(a_ptr[i], b_pair, c_ptr[i]); + } + + if constexpr (N % 2) { + + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a); + __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c); + + __half d_residual = __hfma( + a_residual_ptr[N - 1], + reinterpret_cast<__half const &>(b), + c_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a[i], b, c[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + Array const &b, + half_t const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a); + __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b); + __half2 c_pair = __half2half2(reinterpret_cast<__half const &>(c)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hfma2(a_ptr[i], b_ptr[i], c_pair); + } + + if constexpr (N % 2) { + + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a); + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b); + + __half d_residual = __hfma( + a_residual_ptr[N - 1], + b_residual_ptr[N - 1], + reinterpret_cast<__half const &>(c)); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a[i], b[i], c); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + 
half_t const &b, + half_t const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a); + __half2 b_pair = __half2half2(reinterpret_cast<__half const &>(b)); + __half2 c_pair = __half2half2(reinterpret_cast<__half const &>(c)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hfma2(a_ptr[i], b_pair, c_pair); + } + + if constexpr (N % 2) { + + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a); + + __half d_residual = __hfma( + a_residual_ptr[N - 1], + reinterpret_cast<__half const &>(b), + reinterpret_cast<__half const &>(c)); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a[i], b, c); + } + #endif + + return result; + } +}; + +/// Fused multiply-add-relu0 +template +struct multiply_add_relu0, Array, Array> { + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + Array const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a); + __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b); + __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hfma2_relu(a_ptr[i], b_ptr[i], c_ptr[i]); + } + + if constexpr (N % 2) { + + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a); + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b); + __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c); + + __half d_residual = __hfma_relu( + a_residual_ptr[N - 1], + b_residual_ptr[N - 1], + c_residual_ptr[N - 1]); + + result[N - 1] = 
reinterpret_cast(d_residual); + } + + #else + + multiply_add op; + maximum mx; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mx(op(a[i], b[i], c[i]), (half_t)0); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + half_t const &a, + Array const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 a_pair = __half2half2(reinterpret_cast<__half const &>(a)); + __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b); + __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hfma2_relu(a_pair, b_ptr[i], c_ptr[i]); + } + + if constexpr (N % 2) { + + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b); + __half const *c_residual_ptr = reinterpret_cast<__half const *>(&c); + __half d_residual = __hfma_relu( + reinterpret_cast<__half const &>(a), + b_residual_ptr[N - 1], + c_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + multiply_add op; + maximum mx; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mx(op(a, b[i], c[i]), half_t(0)); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + half_t const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a); + __half2 b_pair = __half2half2(reinterpret_cast<__half const &>(b)); + __half2 const *c_ptr = reinterpret_cast<__half2 const *>(&c); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hfma2_relu(a_ptr[i], b_pair, c_ptr[i]); + } + + if constexpr (N % 2) { + + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a); + 
__half const *c_residual_ptr = reinterpret_cast<__half const *>(&c); + + __half d_residual = __hfma_relu( + a_residual_ptr[N - 1], + reinterpret_cast<__half const &>(b), + c_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + multiply_add op; + maximum mx; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mx(op(a[i], b, c[i]), half_t(0)); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + Array const &b, + half_t const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *a_ptr = reinterpret_cast<__half2 const *>(&a); + __half2 const *b_ptr = reinterpret_cast<__half2 const *>(&b); + __half2 c_pair = __half2half2(reinterpret_cast<__half const &>(c)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = __hfma2_relu(a_ptr[i], b_ptr[i], c_pair); + } + + if constexpr (N % 2) { + + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&a); + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&b); + + __half d_residual = __hfma_relu( + a_residual_ptr[N - 1], + b_residual_ptr[N - 1], + reinterpret_cast<__half const &>(c)); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + multiply_add op; + maximum mx; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mx(op(a[i], b[i], c), half_t(0)); + } + #endif + + return result; + } +}; + +template +struct minimum, PropagateNaN> { + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs, Array const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs); + __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N 
/ 2; ++i) { + result_ptr[i] = PropagateNaN ? __hmin2_nan(lhs_ptr[i], rhs_ptr[i]) + : __hmin2(lhs_ptr[i], rhs_ptr[i]); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs); + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs); + + __half d_residual = PropagateNaN ? __hmin_nan(a_residual_ptr[N - 1], b_residual_ptr[N - 1]) + : __hmin(a_residual_ptr[N - 1], b_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + minimum mn; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mn(lhs[i],rhs[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(half_t const & lhs, Array const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs)); + __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = PropagateNaN ? __hmin2_nan(lhs_pair, rhs_ptr[i]) + : __hmin2(lhs_pair, rhs_ptr[i]); + } + + if constexpr (N % 2) { + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs); + + __half d_residual = PropagateNaN ? 
__hmin_nan(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]) + : __hmin(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + minimum mn; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mn(lhs, rhs[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs, half_t const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs); + __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = PropagateNaN ? __hmin2_nan(lhs_ptr[i], rhs_pair) + : __hmin2(lhs_ptr[i], rhs_pair); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs); + + __half d_residual = PropagateNaN ? __hmin_nan(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs)) + : __hmin(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs)); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + minimum mn; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mn(lhs[i], rhs); + } + #endif + + return result; + } +}; + +template +struct maximum, PropagateNaN> { + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs, Array const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs); + __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = PropagateNaN ? 
__hmax2_nan(lhs_ptr[i], rhs_ptr[i]) + : __hmax2(lhs_ptr[i], rhs_ptr[i]); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs); + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs); + + __half d_residual = PropagateNaN ? __hmax(a_residual_ptr[N - 1], b_residual_ptr[N - 1]) + : __hmax_nan(a_residual_ptr[N - 1], b_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + maximum mx; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mx(lhs[i], rhs[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(half_t const & lhs, Array const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 lhs_pair = __half2half2(reinterpret_cast<__half const &>(lhs)); + __half2 const *rhs_ptr = reinterpret_cast<__half2 const *>(&rhs); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = PropagateNaN ? __hmax2_nan(lhs_pair, rhs_ptr[i]) + : __hmax2(lhs_pair, rhs_ptr[i]); + } + + if constexpr (N % 2) { + __half const *b_residual_ptr = reinterpret_cast<__half const *>(&rhs); + + __half d_residual = PropagateNaN ? 
__hmax_nan(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]) + : __hmax(reinterpret_cast<__half const &>(lhs), b_residual_ptr[N - 1]); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + maximum mx; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mx(lhs, rhs[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const & lhs, half_t const &rhs) const { + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + __half2 *result_ptr = reinterpret_cast<__half2 *>(&result); + __half2 const *lhs_ptr = reinterpret_cast<__half2 const *>(&lhs); + __half2 rhs_pair = __half2half2(reinterpret_cast<__half const &>(rhs)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = PropagateNaN ? __hmax2_nan(lhs_ptr[i], rhs_pair) + : __hmax2(lhs_ptr[i], rhs_pair); + } + + if constexpr (N % 2) { + __half const *a_residual_ptr = reinterpret_cast<__half const *>(&lhs); + + __half d_residual = PropagateNaN ? 
__hmax_nan(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs)) + : __hmax(a_residual_ptr[N - 1], reinterpret_cast<__half const &>(rhs)); + + result[N - 1] = reinterpret_cast(d_residual); + } + + #else + + maximum mx; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = mx(lhs[i], rhs); + } + #endif + + return result; + } +}; + +/// Fused multiply-add +template +struct multiply_add, Array, Array> { + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + Array const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + unsigned *result_ptr = reinterpret_cast(&result); + unsigned const *a_ptr = reinterpret_cast(&a); + unsigned const *b_ptr = reinterpret_cast(&b); + unsigned const *c_ptr = reinterpret_cast(&c); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n" + : "=r"(result_ptr[i]) + : "r"(a_ptr[i]), "r"(b_ptr[i]), "r"(c_ptr[i]) + ); + } + + if constexpr (N % 2) { + + uint16_t *result_ptr = reinterpret_cast(&result); + uint16_t const *a_residual_ptr = reinterpret_cast(&a); + uint16_t const *b_residual_ptr = reinterpret_cast(&b); + uint16_t const *c_residual_ptr = reinterpret_cast(&c); + + asm ("fma.rn.bf16 %0, %1, %2, %3;\n" + : "=h"(result_ptr[N - 1]) + : "h"(a_residual_ptr[N - 1]), "h"(b_residual_ptr[N - 1]), "h"(c_residual_ptr[N - 1]) + ); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a[i], b[i], c[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + bfloat16_t const &a, + Array const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + unsigned *result_ptr = reinterpret_cast(&result); + + unsigned const *b_ptr = reinterpret_cast(&b); + unsigned const *c_ptr = reinterpret_cast(&c); + + unsigned a_packed = static_cast(a.raw()); + a_packed = (a_packed | (a_packed << 16)); + + 
CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n" + : "=r"(result_ptr[i]) + : "r"(a_packed), "r"(b_ptr[i]), "r"(c_ptr[i]) + ); + } + + if constexpr (N % 2) { + + uint16_t *result_ptr = reinterpret_cast(&result); + uint16_t const *a_residual_ptr = reinterpret_cast(&a); + uint16_t const *b_residual_ptr = reinterpret_cast(&b); + uint16_t const *c_residual_ptr = reinterpret_cast(&c); + + asm ("fma.rn.bf16 %0, %1, %2, %3;\n" + : "=h"(result_ptr[N - 1]) + : "h"(a_residual_ptr[0]), "h"(b_residual_ptr[N - 1]), "h"(c_residual_ptr[N - 1]) + ); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a, b[i], c[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + bfloat16_t const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + unsigned *result_ptr = reinterpret_cast(&result); + + unsigned const *a_ptr = reinterpret_cast(&a); + unsigned const *c_ptr = reinterpret_cast(&c); + + unsigned b_packed = static_cast(b.raw()); + b_packed = (b_packed | (b_packed << 16)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n" + : "=r"(result_ptr[i]) + : "r"(a_ptr[i]), "r"(b_packed), "r"(c_ptr[i]) + ); + } + + if constexpr (N % 2) { + + uint16_t *result_ptr = reinterpret_cast(&result); + uint16_t const *a_residual_ptr = reinterpret_cast(&a); + uint16_t const *b_residual_ptr = reinterpret_cast(&b); + uint16_t const *c_residual_ptr = reinterpret_cast(&c); + + asm ("fma.rn.bf16 %0, %1, %2, %3;\n" + : "=h"(result_ptr[N - 1]) + : "h"(a_residual_ptr[N - 1]), "h"(b_residual_ptr[0]), "h"(c_residual_ptr[N - 1]) + ); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a[i], b, c[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + 
Array const &b, + bfloat16_t const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + unsigned *result_ptr = reinterpret_cast(&result); + + unsigned const *a_ptr = reinterpret_cast(&a); + unsigned const *b_ptr = reinterpret_cast(&b); + + unsigned c_packed = static_cast(c.raw()); + c_packed = (c_packed | (c_packed << 16)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n" + : "=r"(result_ptr[i]) + : "r"(a_ptr[i]), "r"(b_ptr[i]), "r"(c_packed) + ); + } + + if constexpr (N % 2) { + + uint16_t *result_ptr = reinterpret_cast(&result); + uint16_t const *a_residual_ptr = reinterpret_cast(&a); + uint16_t const *b_residual_ptr = reinterpret_cast(&b); + uint16_t const *c_residual_ptr = reinterpret_cast(&c); + + asm ("fma.rn.bf16 %0, %1, %2, %3;\n" + : "=h"(result_ptr[N - 1]) + : "h"(a_residual_ptr[N - 1]), "h"(b_residual_ptr[N - 1]), "h"(c_residual_ptr[0]) + ); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a[i], b[i], c); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + bfloat16_t const &b, + bfloat16_t const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + unsigned *result_ptr = reinterpret_cast(&result); + + unsigned const *a_ptr = reinterpret_cast(&a); + + unsigned b_packed = static_cast(b.raw()); + b_packed = (b_packed | (b_packed << 16)); + + unsigned c_packed = static_cast(c.raw()); + c_packed = (c_packed | (c_packed << 16)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n" + : "=r"(result_ptr[i]) + : "r"(a_ptr[i]), "r"(b_packed), "r"(c_packed) + ); + } + + if constexpr (N % 2) { + + uint16_t *result_ptr = reinterpret_cast(&result); + uint16_t const *a_residual_ptr = reinterpret_cast(&a); + uint16_t const *b_residual_ptr = reinterpret_cast(&b); + uint16_t const 
*c_residual_ptr = reinterpret_cast(&c); + + asm ("fma.rn.bf16 %0, %1, %2, %3;\n" + : "=h"(result_ptr[N - 1]) + : "h"(a_residual_ptr[N - 1]), "h"(b_residual_ptr[0]), "h"(c_residual_ptr[0]) + ); + } + + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a[i], b, c); + } + #endif + + return result; + } +}; + + +/// bit_and +template +struct bit_and> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, Array const &b) const { + using ArrayType = Array; + using Storage = typename ArrayType::Storage; + ArrayType result; + + Storage *result_data = result.raw_data(); + Storage const *a_data = a.raw_data(); + Storage const *b_data = b.raw_data(); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < ArrayType::kStorageElements; ++i) { + result_data[i] = (a_data[i] & b_data[i]); + } + + return result; + } +}; + + +/// bit_or +template +struct bit_or> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, Array const &b) const { + using ArrayType = Array; + using Storage = typename ArrayType::Storage; + ArrayType result; + + Storage *result_data = result.raw_data(); + Storage const *a_data = a.raw_data(); + Storage const *b_data = b.raw_data(); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < ArrayType::kStorageElements; ++i) { + result_data[i] = (a_data[i] | b_data[i]); + } + + return result; + } +}; + + +/// bit_not +template +struct bit_not> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a) const { + using ArrayType = Array; + using Storage = typename ArrayType::Storage; + ArrayType result; + + Storage *result_data = result.raw_data(); + Storage const *a_data = a.raw_data(); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < ArrayType::kStorageElements; ++i) { + result_data[i] = (~a_data[i]); + } + + return result; + } +}; + +/// bit_xor +template +struct bit_xor> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, Array const &b) const { + using ArrayType = Array; + using Storage = typename 
ArrayType::Storage; + ArrayType result; + + Storage *result_data = result.raw_data(); + Storage const *a_data = a.raw_data(); + Storage const *b_data = b.raw_data(); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < ArrayType::kStorageElements; ++i) { + result_data[i] = (a_data[i] ^ b_data[i]); + } + + return result; + } +}; + +/// Fused and-popc-add +template +struct and_popc_add, Array, Array> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, Array const &b, Array const &c) const { + Array result; + and_popc_add scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(a[i], b[i], c[i]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, T const &scalar, Array const &c) const { + Array result; + and_popc_add scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(a[i], scalar, c[i]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(T const &scalar, Array const &b, Array const &c) const { + Array result; + and_popc_add scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(scalar, b[i], c[i]); + } + + return result; + } +}; + + +/// Fused or-popc-add +template +struct or_popc_add, Array, Array> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, Array const &b, Array const &c) const { + Array result; + or_popc_add scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(a[i], b[i], c[i]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, T const &scalar, Array const &c) const { + Array result; + or_popc_add scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(a[i], scalar, c[i]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(T const &scalar, Array const &b, Array const &c) const { + Array result; + or_popc_add scalar_op; + + CUTLASS_PRAGMA_UNROLL + for 
(int i = 0; i < N; ++i) { + result[i] = scalar_op(scalar, b[i], c[i]); + } + + return result; + } +}; + +/// Fused xor-popc-add +template +struct xor_popc_add, Array, Array> { + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, Array const &b, Array const &c) const { + Array result; + xor_popc_add scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(a[i], b[i], c[i]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(Array const &a, T const &scalar, Array const &c) const { + Array result; + xor_popc_add scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(a[i], scalar, c[i]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()(T const &scalar, Array const &b, Array const &c) const { + Array result; + xor_popc_add scalar_op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = scalar_op(scalar, b[i], c[i]); + } + + return result; + } +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// +// Operator overloads +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +CUTLASS_HOST_DEVICE +Array operator+(Array const &lhs, Array const &rhs) { + plus> op; + return op(lhs, rhs); +} + +template +CUTLASS_HOST_DEVICE +Array operator+(T const &lhs, Array const &rhs) { + plus> op; + return op(lhs, rhs); +} + +template +CUTLASS_HOST_DEVICE +Array operator+(Array const &lhs, T const &rhs) { + plus> op; + return op(lhs, rhs); +} + +template +CUTLASS_HOST_DEVICE +Array operator-(Array const &lhs, Array const &rhs) { + minus> op; + return op(lhs, rhs); +} + +template +CUTLASS_HOST_DEVICE +Array operator-(Array const &lhs) { + negate> op; + return op(lhs); +} + +template +CUTLASS_HOST_DEVICE +Array operator*(Array const &lhs, Array const &rhs) { + multiplies> op; + return op(lhs, rhs); +} + +template +CUTLASS_HOST_DEVICE +Array 
operator*(T lhs, Array const &rhs) { + multiplies> op; + return op(lhs, rhs); +} + +template +CUTLASS_HOST_DEVICE +Array operator*(Array const &lhs, T rhs) { + multiplies> op; + return op(lhs, rhs); +} + +template +CUTLASS_HOST_DEVICE +Array operator/(Array const &lhs, Array const &rhs) { + divides> op; + return op(lhs, rhs); +} + +template +CUTLASS_HOST_DEVICE +Array fma(Array const &a, Array const &b, Array const &c) { + multiply_add> op; + return op(a, b, c); +} + +template +CUTLASS_HOST_DEVICE +Array fma(T a, Array const &b, Array const &c) { + multiply_add> op; + return op(a, b, c); +} + +template +CUTLASS_HOST_DEVICE +Array fma(Array const &a, T b, Array const &c) { + multiply_add> op; + return op(a, b, c); +} + +template +CUTLASS_HOST_DEVICE +Array fma(Array const &a, Array const &b, T c) { + multiply_add> op; + return op(a, b, c); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// AlignedArray +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Aligned array type +template < + /// Element type + typename T, + /// Number of elements in the array + int N, + /// Alignment requirement in bytes + int Alignment = ( sizeof_bits::value * N + 7 ) / 8 +> +class alignas(Alignment) AlignedArray: public Array { +public: + +}; + +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/array_subbyte.h" + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/array_planar_complex.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/array_planar_complex.h new file mode 100644 index 
0000000000000000000000000000000000000000..0bd9d0d7f7dc709b951c6979a3e26cf05ba9c79d --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/array_planar_complex.h @@ -0,0 +1,89 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/*! \file + \brief Templates implementing warp-level matrix multiply-accumulate operations. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Array holding planar complex elements +template +struct ArrayPlanarComplex { + + /// Underlying real element + using Element = Element_; + + /// Number of logical elements + static constexpr size_t kElements = N; + + /// Underlying Fragment of real-valued elemenets + using ArrayReal = cutlass::Array; + +public: + /// Fragment of real-valued elements representing the real part + ArrayReal real; + + /// Fragment of real-valued elements representing the imaginary part + ArrayReal imag; + +public: + /// Sets the array to zero efficiently + CUTLASS_HOST_DEVICE + void clear() { + real.clear(); + imag.clear(); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to deduce template arguments +template +CUTLASS_HOST_DEVICE +ArrayPlanarComplex +make_ArrayPlanarComplex(Array const &real, Array const &imag) { + return ArrayPlanarComplex{real, imag}; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/array_subbyte.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/array_subbyte.h new file mode 100644 index 0000000000000000000000000000000000000000..756890bb61f7ff5f2a9912b00b98a54deae6ee75 --- 
/dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/array_subbyte.h @@ -0,0 +1,561 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Statically sized array of elements that accommodates all CUTLASS-supported numeric types + and is safe to use in a union. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/platform/platform.h" + +namespace cutlass { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Statically sized array for any data type +template < + typename T, + int N +> +struct Array { + static constexpr int kSizeBits = sizeof_bits::value * N; + + /// Storage type + using Storage = typename platform::conditional< + ((kSizeBits % 32) != 0), + typename platform::conditional< + ((kSizeBits % 16) != 0), + uint8_t, + uint16_t + >::type, + uint32_t + >::type; + + /// Element type + using Element = T; + + /// Number of logical elements per stored object + static constexpr int kElementsPerStoredItem = int(sizeof(Storage) * 8) / sizeof_bits::value; + + /// Number of storage elements + static constexpr size_t kStorageElements = (N + kElementsPerStoredItem - 1) / kElementsPerStoredItem; + + /// Number of logical elements + static constexpr size_t kElements = N; + + /// Bitmask for covering one item + static constexpr Storage kMask = ((Storage(1) << sizeof_bits::value) - 1); + + // + // C++ standard members with pointer types removed + // + + typedef T value_type; + typedef size_t size_type; + typedef ptrdiff_t difference_type; + typedef value_type *pointer; + typedef value_type const *const_pointer; + + // + // References + // + + /// Reference object inserts or extracts sub-byte items + class reference { + /// Pointer to storage element + Storage *ptr_{nullptr}; + + /// Index into elements packed into Storage object + int idx_{0}; + + public: + + reference() = default; + + /// Ctor + CUTLASS_HOST_DEVICE + reference(Storage *ptr, int idx = 0): ptr_(ptr), idx_(idx) { } + + /// Assignment + CUTLASS_HOST_DEVICE + reference &operator=(T x) { + // `*ptr_ & kUpdateMask` will read ptr_ 
before write to it + // This means code pattern like + // + // ```cpp + // Array result; + // result[0] = xxx; + // ``` + // + // Will leads to compiler warning on use of uninitialized member variable. Although we know + // this read of uninitialized member variable is harmeless. + +#if defined(__clang__) +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wuninitialized" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wuninitialized" +# pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + + Storage item = (reinterpret_cast(x) & kMask); + + Storage kUpdateMask = Storage(~(kMask << (idx_ * sizeof_bits::value))); + + *ptr_ = Storage(((*ptr_ & kUpdateMask) | (item << idx_ * sizeof_bits::value))); + +#if defined(__clang__) +# pragma clang diagnostic pop +#elif defined(__GNUC__) +# pragma GCC diagnostic pop +#endif + + return *this; + } + + CUTLASS_HOST_DEVICE + T get() const { + Storage item = Storage((*ptr_ >> (idx_ * sizeof_bits::value)) & kMask); + return reinterpret_cast(item); + } + + /// Extract + CUTLASS_HOST_DEVICE + operator T() const { + return get(); + } + + /// Explicit cast to int + CUTLASS_HOST_DEVICE + explicit operator int() const { + return int(get()); + } + + /// Explicit cast to float + CUTLASS_HOST_DEVICE + explicit operator float() const { + return float(get()); + } + }; + + /// Reference object extracts sub-byte items + class const_reference { + + /// Pointer to storage element + Storage const *ptr_{nullptr}; + + /// Index into elements packed into Storage object + int idx_{0}; + + public: + + const_reference() = default; + + /// Ctor + CUTLASS_HOST_DEVICE + const_reference(Storage const *ptr, int idx = 0): ptr_(ptr), idx_(idx) { } + + CUTLASS_HOST_DEVICE + const T get() const { + Storage item = (*ptr_ >> (idx_ * sizeof_bits::value)) & kMask; + return reinterpret_cast(item); + } + + /// Extract + CUTLASS_HOST_DEVICE + operator T() const { + Storage item = Storage(Storage(*ptr_ 
>> Storage(idx_ * sizeof_bits::value)) & kMask); + return reinterpret_cast(item); + } + + /// Explicit cast to int + CUTLASS_HOST_DEVICE + explicit operator int() const { + return int(get()); + } + + /// Explicit cast to float + CUTLASS_HOST_DEVICE + explicit operator float() const { + return float(get()); + } + }; + + // + // Iterators + // + + /// Bidirectional iterator over elements + class iterator { + + /// Pointer to storage element + Storage *ptr_{nullptr}; + + /// Index into elements packed into Storage object + int idx_{0}; + + public: + + iterator() = default; + + CUTLASS_HOST_DEVICE + iterator(Storage *ptr, int idx = 0): ptr_(ptr), idx_(idx) { } + + CUTLASS_HOST_DEVICE + iterator &operator++() { + ++idx_; + if (idx_ == kElementsPerStoredItem) { + ++ptr_; + idx_ = 0; + } + return *this; + } + + CUTLASS_HOST_DEVICE + iterator &operator--() { + if (!idx_) { + --ptr_; + idx_ = kElementsPerStoredItem - 1; + } + else { + --idx_; + } + return *this; + } + + CUTLASS_HOST_DEVICE + iterator operator++(int) { + iterator ret(*this); + ++idx_; + if (idx_ == kElementsPerStoredItem) { + ++ptr_; + idx_ = 0; + } + return ret; + } + + CUTLASS_HOST_DEVICE + iterator operator--(int) { + iterator ret(*this); + if (!idx_) { + --ptr_; + idx_ = kElementsPerStoredItem - 1; + } + else { + --idx_; + } + return ret; + } + + CUTLASS_HOST_DEVICE + reference operator*() const { + return reference(ptr_, idx_); + } + + CUTLASS_HOST_DEVICE + bool operator==(iterator const &other) const { + return ptr_ == other.ptr_ && idx_ == other.idx_; + } + + CUTLASS_HOST_DEVICE + bool operator!=(iterator const &other) const { + return !(*this == other); + } + }; + + /// Bidirectional constant iterator over elements + class const_iterator { + + /// Pointer to storage element + Storage const *ptr_{nullptr}; + + /// Index into elements packed into Storage object + int idx_{0}; + + public: + + const_iterator() = default; + + CUTLASS_HOST_DEVICE + const_iterator(Storage const *ptr, int idx = 0): 
ptr_(ptr), idx_(idx) { } + + CUTLASS_HOST_DEVICE + iterator &operator++() { + ++idx_; + if (idx_ == kElementsPerStoredItem) { + ++ptr_; + idx_ = 0; + } + return *this; + } + + CUTLASS_HOST_DEVICE + iterator &operator--() { + if (!idx_) { + --ptr_; + idx_ = kElementsPerStoredItem - 1; + } + else { + --idx_; + } + return *this; + } + + CUTLASS_HOST_DEVICE + iterator operator++(int) { + iterator ret(*this); + ++idx_; + if (idx_ == kElementsPerStoredItem) { + ++ptr_; + idx_ = 0; + } + return ret; + } + + CUTLASS_HOST_DEVICE + iterator operator--(int) { + iterator ret(*this); + if (!idx_) { + --ptr_; + idx_ = kElementsPerStoredItem - 1; + } + else { + --idx_; + } + return ret; + } + + CUTLASS_HOST_DEVICE + const_reference operator*() const { + return const_reference(ptr_, idx_); + } + + CUTLASS_HOST_DEVICE + bool operator==(iterator const &other) const { + return ptr_ == other.ptr_ && idx_ == other.idx_; + } + + CUTLASS_HOST_DEVICE + bool operator!=(iterator const &other) const { + return !(*this == other); + } + }; + + /// Bidirectional iterator over elements + class reverse_iterator { + + /// Pointer to storage element + Storage *ptr_{nullptr}; + + /// Index into elements packed into Storage object + int idx_{0}; + + public: + + reverse_iterator() = default; + + CUTLASS_HOST_DEVICE + reverse_iterator(Storage *ptr, int idx = 0): ptr_(ptr), idx_(idx) { } + }; + + /// Bidirectional constant iterator over elements + class const_reverse_iterator { + + /// Pointer to storage element + Storage const *ptr_{nullptr}; + + /// Index into elements packed into Storage object + int idx_{0}; + + public: + + const_reverse_iterator() = default; + + CUTLASS_HOST_DEVICE + const_reverse_iterator(Storage const *ptr, int idx = 0): ptr_(ptr), idx_(idx) { } + }; + + /// Efficient clear method + CUTLASS_HOST_DEVICE + void clear() { + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < int(kStorageElements); ++i) { + storage[i] = Storage(0); + } + } + + CUTLASS_HOST_DEVICE + reference at(size_type 
pos) { + return reference(storage + pos / kElementsPerStoredItem, pos % kElementsPerStoredItem); + } + + CUTLASS_HOST_DEVICE + const_reference at(size_type pos) const { + return const_reference(storage + pos / kElementsPerStoredItem, pos % kElementsPerStoredItem); + } + + CUTLASS_HOST_DEVICE + reference operator[](size_type pos) { + return at(pos); + } + + CUTLASS_HOST_DEVICE + const_reference operator[](size_type pos) const { + return at(pos); + } + + CUTLASS_HOST_DEVICE + reference front() { + return at(0); + } + + CUTLASS_HOST_DEVICE + const_reference front() const { + return at(0); + } + + CUTLASS_HOST_DEVICE + reference back() { + return reference(storage + kStorageElements - 1, kElementsPerStoredItem - 1); + } + + CUTLASS_HOST_DEVICE + const_reference back() const { + return const_reference(storage + kStorageElements - 1, kElementsPerStoredItem - 1); + } + + CUTLASS_HOST_DEVICE + pointer data() { + return reinterpret_cast(storage); + } + + CUTLASS_HOST_DEVICE + const_pointer data() const { + return reinterpret_cast(storage); + } + + CUTLASS_HOST_DEVICE + Storage * raw_data() { + return storage; + } + + CUTLASS_HOST_DEVICE + Storage const * raw_data() const { + return storage; + } + + CUTLASS_HOST_DEVICE + constexpr bool empty() const { + return !kElements; + } + + CUTLASS_HOST_DEVICE + constexpr size_type size() const { + return kElements; + } + + CUTLASS_HOST_DEVICE + constexpr size_type max_size() const { + return kElements; + } + + CUTLASS_HOST_DEVICE + void fill(T const &value) { + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kElementsPerStoredItem; ++i) { + reference ref(storage, i); + ref = value; + } + + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < kStorageElements; ++i) { + storage[i] = storage[0]; + } + } + + CUTLASS_HOST_DEVICE + iterator begin() { + return iterator(storage); + } + + CUTLASS_HOST_DEVICE + const_iterator cbegin() const { + return const_iterator(storage); + } + + CUTLASS_HOST_DEVICE + iterator end() { + return iterator(storage + 
kStorageElements); + } + + CUTLASS_HOST_DEVICE + const_iterator cend() const { + return const_iterator(storage + kStorageElements); + } + + CUTLASS_HOST_DEVICE + reverse_iterator rbegin() { + return reverse_iterator(storage + kStorageElements); + } + + CUTLASS_HOST_DEVICE + const_reverse_iterator crbegin() const { + return const_reverse_iterator(storage + kStorageElements); + } + + CUTLASS_HOST_DEVICE + reverse_iterator rend() { + return reverse_iterator(storage); + } + + CUTLASS_HOST_DEVICE + const_reverse_iterator crend() const { + return const_reverse_iterator(storage); + } + +private: + /// Internal storage + Storage storage[kStorageElements]; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/barrier.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/barrier.h new file mode 100644 index 0000000000000000000000000000000000000000..8919e992af20ac2d7f2b5daa8a0cbd7a6f7b79e5 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/barrier.h @@ -0,0 +1,377 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Implementation of a CTA-wide barrier for inter-CTA synchronization. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/arch/barrier.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +namespace detail { + +// +// Utilities for abstracting synchronization methods for barriers +// + +struct SyncthreadsSync { + CUTLASS_DEVICE + static void sync() { + __syncthreads(); + } +}; + +struct SyncwarpSync { + CUTLASS_DEVICE + static void sync() { + __syncwarp(); + } +}; + +template < + int ThreadCount, + int BarrierId +> +struct NamedBarrierSync { + CUTLASS_DEVICE + static void sync() { + cutlass::arch::NamedBarrier::sync(ThreadCount, static_cast(BarrierId)); + } +}; + +} // namepspace detail + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Group or CTA-wide semaphore for inter-CTA synchronization. +template +struct GenericBarrier { + +public: + + /// Flag type + using T = int; + + /// Initial flag value + static const T INIT = 0; + + +protected: + + /// Load flag, as a strong acquire operation (int specialization) + CUTLASS_DEVICE + static int ld_acquire(int *ptr) + { + int state = 0; + +#if (__CUDA_ARCH__ >= 700) + /// SM70 and newer use memory consistency qualifiers + + // Acquire pattern using acquire modifier + asm volatile ("ld.global.acquire.gpu.b32 %0, [%1];\n" : "=r"(state) : "l"(ptr)); + +#else + asm volatile ("ld.cg.global.b32 %0, [%1];\n" : "=r"(state) : "l"(ptr)); +#endif // (__CUDA_ARCH__ >= 700) + + return state; + } + + + /// Reduce into flag, with release pattern (int specialization) + CUTLASS_DEVICE + static void red_release(int *ptr, int val) + { +#if (__CUDA_ARCH__ >= 700) + /// SM70 and newer use memory consistency qualifiers + + // Release pattern using acq_rel fence + relaxed modifier. 
(The fence also releases data + // that was weakly-written by other threads prior to the last syncthreads) + asm volatile ("fence.acq_rel.gpu;\n"); + asm volatile ("red.relaxed.gpu.global.add.s32 [%0], %1;\n" : : "l"(ptr), "r"(val)); + +#else + __threadfence(); + atomicAdd(ptr, val); +#endif // (__CUDA_ARCH__ >= 700) + } + + +public: + + /// Uses thread[0] to wait for at least the specified count of signals on the given flag counter + CUTLASS_DEVICE + static void wait_lt(void *lock_ptr, int thread_idx, int flag_idx, int count) + { + T *flag_ptr = reinterpret_cast(lock_ptr) + flag_idx; + + if (thread_idx == 0) + { + // Spin-loop + #pragma unroll 1 + while(ld_acquire(flag_ptr) < count) {} + } + + Sync::sync(); + } + + /// Uses thread[0] to wait for at least the specified count of signals on the given flag counter + CUTLASS_DEVICE + static void wait_eq(void *lock_ptr, int thread_idx, int flag_idx, T val = 1) + { + T *flag_ptr = reinterpret_cast(lock_ptr) + flag_idx; + + if (thread_idx == 0) + { + // Spin-loop + #pragma unroll 1 + while(ld_acquire(flag_ptr) != val) {} + } + Sync::sync(); + } + + /// Uses thread[0] to wait for the specified count of signals on the given flag counter + CUTLASS_DEVICE + static void wait_eq_reset(void *lock_ptr, int thread_idx, int flag_idx, T val = 1) { + T *flag_ptr = reinterpret_cast(lock_ptr) + flag_idx; + + if (thread_idx == 0) + { + // Spin-loop + #pragma unroll 1 + while(atomicCAS(flag_ptr, val, 0) != val) {} + } + + Sync::sync(); + } + + /// Increment the arrival count for a flag + CUTLASS_DEVICE + static void arrive_inc(void *lock_ptr, int thread_idx, int flag_idx, int val = 1) + { + T* flag_ptr = reinterpret_cast(lock_ptr) + flag_idx; + + Sync::sync(); + + if (thread_idx == 0) + { + red_release(flag_ptr, val); + } + } + + + /// Increment the arrival counts for a range of flags + CUTLASS_DEVICE + static void arrive_range_inc(void *lock_ptr, int thread_idx, int first_flag_idx, int count = 1, int val = 1) + { + int flag_idx = 
first_flag_idx + thread_idx; + T* flag_ptr = reinterpret_cast(lock_ptr) + flag_idx; + + // Barrier to make sure all other threads in group have written their data + Sync::sync(); + + // Select threads increment their flags + if (thread_idx < count) { + red_release(flag_ptr, val); + } + } +}; + +using Barrier = GenericBarrier; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/** Structure for managing multiple NamedBarriers to be used by different warp groups, allowing + * runtime index values to be used to call into named barriers with compile-time-constant IDs. + * + * @param ThreadCount_ Number of threads that will wait on a NamedBarrier with a given ID + * @param Offset Value added to the ID passed in by the user to determine the NamedBarrier ID to call into + * @param MaxNumNamedBarriers The maximum number of unique barrier IDs that will be requested on this type +**/ +template < + uint32_t ThreadCount_, + uint32_t Offset = 0, + uint32_t MaxNumNamedBarriers = 16 +> +struct NamedBarrierManager { + + static_assert(MaxNumNamedBarriers <= arch::NamedBarrier::HardwareMaxNumNamedBarriers); + static_assert(MaxNumNamedBarriers + Offset <= arch::NamedBarrier::HardwareMaxNumNamedBarriers, "Barrier IDs cannot exceed 15"); + + // Number of threads participating in the barrier + static constexpr uint32_t ThreadCount = ThreadCount_; + + template + using BarrierSync = cutlass::GenericBarrier>; + + // Underlying type used by all barriers for synchronization. Does not depend on + // template parameter BarrierId, so passing in 0 suffices. 
+ using T = typename BarrierSync<0>::T; + + using IntegerSequence = cute::make_integer_sequence; + + CUTLASS_DEVICE + static + void wait_lt(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, int count) { + wait_lt_helper(idx, lock_ptr, thread_idx, flag_idx, count, IntegerSequence{}); + } + + CUTLASS_DEVICE + static void + wait_eq(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) { + wait_eq_helper(idx, lock_ptr, thread_idx, flag_idx, val, IntegerSequence{}); + } + + CUTLASS_DEVICE + static void + wait_eq_reset(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) { + wait_eq_helper(idx, lock_ptr, thread_idx, flag_idx, val, IntegerSequence{}); + } + + CUTLASS_DEVICE + static void + arrive_inc(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, int val = 1) { + arrive_inc_helper(idx, lock_ptr, thread_idx, flag_idx, val, IntegerSequence{}); + } + + CUTLASS_DEVICE + static void + arrive_range_inc(uint32_t idx, void *lock_ptr, int thread_idx, int first_flag_idx, int count = 1, int val = 1) { + arrive_range_inc_helper(idx, lock_ptr, thread_idx, first_flag_idx, count, val, IntegerSequence{}); + } + +private: + CUTLASS_DEVICE + static void + check_barrier_in_range([[maybe_unused]] uint32_t idx) { + assert((idx < MaxNumNamedBarriers) && "Index exceeds barrier count"); + } + + template + CUTLASS_DEVICE + static void + wait_lt_helper(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, int count, cute::integer_sequence) { + check_barrier_in_range(idx); + ((Idx == idx && (BarrierSync::wait_lt(lock_ptr, thread_idx, flag_idx, count), true)) || ...); + } + + template + CUTLASS_DEVICE + static void + wait_eq_helper(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, T val, cute::integer_sequence) { + check_barrier_in_range(idx); + if constexpr (Reset) { + ((Idx == idx && (BarrierSync::wait_eq_reset(lock_ptr, thread_idx, flag_idx, val), true)) || ...); + } + else { + ((Idx == idx && 
(BarrierSync::wait_eq(lock_ptr, thread_idx, flag_idx, val), true)) || ...); + } + } + + template + CUTLASS_DEVICE + static void + arrive_inc_helper(uint32_t idx, void *lock_ptr, int thread_idx, int flag_idx, int val, cute::integer_sequence) { + check_barrier_in_range(idx); + ((Idx == idx && (BarrierSync::arrive_inc(lock_ptr, thread_idx, flag_idx, val), true)) || ...); + } + + template + CUTLASS_DEVICE + static void + arrive_range_inc_helper(uint32_t idx, void *lock_ptr, int thread_idx, int first_flag_idx, int count, int val, cute::integer_sequence) { + check_barrier_in_range(idx); + ((Idx == idx && (BarrierSync::arrive_range_inc(lock_ptr, thread_idx, first_flag_idx, count, val), true)) || ...); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/** Structure for synchronizing via contiguous barriers (e.g., __syncwarp, __syncthreads) + * via an API that mirrors that of NamedBarrierManager + * + * @param Synchronizer Synchronization helper exposing a `sync()` method to perform synchronization +**/ +template < + class Synchronizer, + uint32_t ThreadCount_ +> +struct SyncManager { + + // Number of threads participating in the barrier + static constexpr uint32_t ThreadCount = ThreadCount_; + + using BarrierSync = cutlass::GenericBarrier; + + // Underlying type used by all barriers for synchronization. 
+ using T = typename BarrierSync::T; + + CUTLASS_DEVICE + static + void wait_lt(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, int count) { + BarrierSync::wait_lt(lock_ptr, thread_idx, flag_idx, count); + } + + CUTLASS_DEVICE + static void + wait_eq(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) { + BarrierSync::wait_eq(lock_ptr, thread_idx, flag_idx, val); + } + + CUTLASS_DEVICE + static void + wait_eq_reset(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) { + BarrierSync::wait_eq_reset(lock_ptr, thread_idx, flag_idx, val); + } + + CUTLASS_DEVICE + static void + arrive_inc(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, int val = 1) { + BarrierSync::arrive_inc(lock_ptr, thread_idx, flag_idx, val); + } + + CUTLASS_DEVICE + static void + arrive_range_inc(uint32_t idx, void *lock_ptr, int thread_idx, int first_flag_idx, int count = 1, int val = 1) { + BarrierSync::arrive_range_inc(lock_ptr, thread_idx, first_flag_idx, count, val); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/bfloat16.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/bfloat16.h new file mode 100644 index 0000000000000000000000000000000000000000..5e2f40b1c85e24eb2bdeedb191529d53539f050c --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/bfloat16.h @@ -0,0 +1,679 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! + \file + \brief Defines a proxy class for storing non-standard 16-bit floating point values with + 8 bits of exponent and 7 bit of mantissa. 
+*/ + +#pragma once + +#if defined(__CUDACC_RTC__) +#include "cutlass/floating_point_nvrtc.h" +#else +#include +#include +#include +#include +#endif + +#include +#include "cutlass/cutlass.h" +#include "cutlass/platform/platform.h" + +namespace cutlass { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Floating-point type with 8 bits of exponent and 7 bits of mantissa. +struct alignas(2) bfloat16_t { + + // + // Data members + // + + /// Storage type + uint16_t storage; + + // + // Methods + // + + /// Constructs from an unsigned short + CUTLASS_HOST_DEVICE + static bfloat16_t bitcast(uint16_t x) { + bfloat16_t h; + h.storage = x; + return h; + } + +private: + struct from_32_bit_integer_t {}; + static constexpr from_32_bit_integer_t from_32_bit_integer{}; + + template + CUTLASS_HOST_DEVICE + explicit bfloat16_t(from_32_bit_integer_t, T x) { + static_assert(cutlass::platform::is_integral::value && sizeof(T) == 4, "Requires 32-bit integer"); + + float flt = static_cast(x); + uint32_t bits; + + #if defined(__CUDA_ARCH__) + bits = reinterpret_cast(flt); + #else + std::memcpy(&bits, &flt, sizeof(bits)); + #endif + + storage = uint16_t(bits >> 16); + } + +public: + /// Default constructor + bfloat16_t() = default; + + /// Reinterpret cast from CUDA's __nv_bfloat16 type + CUTLASS_HOST_DEVICE + explicit bfloat16_t(__nv_bfloat16 const & x) { + #if defined(__CUDA_ARCH__) + storage = reinterpret_cast(x); + #else + __nv_bfloat16_raw raw(x); + std::memcpy(&storage, &raw.x, sizeof(storage)); + #endif + } + + /// Floating-point conversion - round toward nearest + CUTLASS_HOST_DEVICE + explicit bfloat16_t(float x) { + + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDACC_VER_MAJOR__ >= 11) + + asm("cvt.rn.bf16.f32 %0, %1;\n" : "=h"(storage) : "f"(x)); + + #else + uint32_t bits; + + #if defined(__CUDA_ARCH__) + bits = reinterpret_cast(x); + #else + std::memcpy(&bits, &x, sizeof(bits)); + #endif + + if ((bits 
& 0x7f800000) != 0x7f800000) { + + bool mantissa_bit = ((bits & (1 << 16)) != 0); + bool round_bit = ((bits & (1 << 15)) != 0); + bool sticky_bit = ((bits & ((1 << 15) - 1)) != 0); + + if ((round_bit && sticky_bit) || (round_bit && mantissa_bit)) { + bits += uint32_t(1 << 16); + } + } + else if (bits & ~0xff800000) { + bits = 0x7fffffff; + } + + storage = uint16_t((bits >> 16) & 0xffff); + #endif + } + + /// Floating-point conversion - round toward nearest + CUTLASS_HOST_DEVICE + explicit bfloat16_t(double x): bfloat16_t(float(x)) { + + } + + /// Integer conversion - round toward nearest + CUTLASS_HOST_DEVICE + explicit bfloat16_t(int x) : bfloat16_t(from_32_bit_integer, x) {} + + CUTLASS_HOST_DEVICE + explicit bfloat16_t(uint32_t x) : bfloat16_t(from_32_bit_integer, x) {} + + /// Converts to float + CUTLASS_HOST_DEVICE + operator float() const { + unsigned bits = (unsigned(storage) << 16); + #if defined(__CUDA_ARCH__) + return reinterpret_cast(bits); + #else + float flt; + std::memcpy(&flt, &bits, sizeof(flt)); + return flt; + #endif + } + + /// Converts to float + CUTLASS_HOST_DEVICE + explicit operator double() const { + return double(float(*this)); + } + + /// Converts to int + CUTLASS_HOST_DEVICE + explicit operator int() const { + return int(float(*this)); + } + + /// Casts to bool + CUTLASS_HOST_DEVICE + explicit operator bool() const { + return (float(*this) != 0.0f); + } + + /// Bitcasts to CUDA's bf16 type + CUTLASS_DEVICE + __nv_bfloat16 to_nv_bfloat16() const { + return reinterpret_cast<__nv_bfloat16 const &>(storage); + } + + /// Obtains raw bits + CUTLASS_HOST_DEVICE + uint16_t raw() const { + return storage; + } + /// Returns the sign bit + CUTLASS_HOST_DEVICE + bool signbit() const { + return ((raw() & 0x8000) != 0); + } + + /// Returns the biased exponent + CUTLASS_HOST_DEVICE + int exponent_biased() const { + return int((raw() >> 7) & 0x0ff); + } + + /// Returns the unbiased exponent + CUTLASS_HOST_DEVICE + int exponent() const { + return 
exponent_biased() - 127; + } + + /// Returns the mantissa + CUTLASS_HOST_DEVICE + int mantissa() const { + return int(raw() & 0x7f); + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +CUTLASS_HOST_DEVICE +bool signbit(cutlass::bfloat16_t const& h) { + return h.signbit(); +} + +CUTLASS_HOST_DEVICE +cutlass::bfloat16_t abs(cutlass::bfloat16_t const& h) { + return cutlass::bfloat16_t::bitcast(h.raw() & 0x7fff); +} + +CUTLASS_HOST_DEVICE +bool isnan(cutlass::bfloat16_t const& h) { + return (h.exponent_biased() == 0x0ff) && h.mantissa(); +} + +CUTLASS_HOST_DEVICE +bool isfinite(cutlass::bfloat16_t const& h) { + return (h.exponent_biased() != 0x0ff); +} + +CUTLASS_HOST_DEVICE +cutlass::bfloat16_t nan_bf16(const char*) { + // NVIDIA canonical NaN + return cutlass::bfloat16_t::bitcast(0x7fff); +} + +CUTLASS_HOST_DEVICE +bool isinf(cutlass::bfloat16_t const& h) { + return (h.exponent_biased() == 0x0ff) && !h.mantissa(); +} + +CUTLASS_HOST_DEVICE +bool isnormal(cutlass::bfloat16_t const& h) { + return h.exponent_biased() && h.exponent_biased() != 0x0ff; +} + +CUTLASS_HOST_DEVICE +int fpclassify(cutlass::bfloat16_t const& h) { + int exp = h.exponent_biased(); + int mantissa = h.mantissa(); + if (exp == 0x0ff) { + if (mantissa) { + return FP_NAN; + } + else { + return FP_INFINITE; + } + } + else if (!exp) { + if (mantissa) { + return FP_SUBNORMAL; + } + else { + return FP_ZERO; + } + } + return FP_NORMAL; +} + +CUTLASS_HOST_DEVICE +cutlass::bfloat16_t sqrt(cutlass::bfloat16_t const& h) { +#if defined(__CUDACC_RTC__) + return cutlass::bfloat16_t(sqrtf(float(h))); +#else + return cutlass::bfloat16_t(std::sqrt(float(h))); +#endif +} + +CUTLASS_HOST_DEVICE +bfloat16_t copysign(bfloat16_t const& a, bfloat16_t const& b) { + + uint16_t a_bits; + uint16_t b_bits; + + #if defined(__CUDA_ARCH__) + a_bits = reinterpret_cast(a); + b_bits = reinterpret_cast(b); + #else + std::memcpy(&a_bits, &a, sizeof(a_bits)); + 
std::memcpy(&b_bits, &b, sizeof(b_bits)); + #endif + + uint16_t a_mag = (a_bits & 0x7fff); + uint16_t b_sign = (b_bits & 0x8000); + uint16_t result = (a_mag | b_sign); + + return bfloat16_t::bitcast(result); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Standard Library operations and definitions +// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#if !defined(__CUDACC_RTC__) +namespace std { + +/// Numeric limits +template <> +struct numeric_limits { + static bool const is_specialized = true; + static bool const is_signed = true; + static bool const is_integer = false; + static bool const is_exact = false; + static bool const has_infinity = true; + static bool const has_quiet_NaN = true; + static bool const has_signaling_NaN = false; + static std::float_denorm_style const has_denorm = std::denorm_present; + static bool const has_denorm_loss = true; + static std::float_round_style const round_style = std::round_to_nearest; + static bool const is_iec559 = false; + static bool const is_bounded = true; + static bool const is_modulo = false; + static int const digits = 7; + + /// Least positive value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t min() { return cutlass::bfloat16_t::bitcast(0x01); } + + /// Minimum finite value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t lowest() { return cutlass::bfloat16_t::bitcast(0xff7f); } + + /// Maximum finite value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t max() { return cutlass::bfloat16_t::bitcast(0x7f7f); } + + /// Returns smallest finite value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t epsilon() { return cutlass::bfloat16_t::bitcast(0x1000); } + + /// Returns smallest finite value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t round_error() { 
return cutlass::bfloat16_t(0.5f); } + + /// Returns smallest finite value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t infinity() { return cutlass::bfloat16_t::bitcast(0x7f80); } + + /// Returns smallest finite value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t quiet_NaN() { return cutlass::bfloat16_t::bitcast(0x7fff); } + + /// Returns smallest finite value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t signaling_NaN() { return cutlass::bfloat16_t::bitcast(0x7fff); } + + /// Returns smallest finite value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t denorm_min() { return cutlass::bfloat16_t::bitcast(0x1); } +}; + +} // namespace std +#endif + +namespace cutlass { +namespace platform { + +/// Forward Declaration +template +struct numeric_limits; + +/// Numeric limits +template <> +struct numeric_limits { + static bool const is_specialized = true; + static bool const is_signed = true; + static bool const is_integer = false; + static bool const is_exact = false; + static bool const has_infinity = true; + static bool const has_quiet_NaN = true; + static bool const has_signaling_NaN = false; +#if !defined(__CUDACC_RTC__) + static std::float_denorm_style const has_denorm = std::denorm_present; +#endif + static bool const has_denorm_loss = true; +#if !defined(__CUDACC_RTC__) + static std::float_round_style const round_style = std::round_to_nearest; +#endif + static bool const is_iec559 = false; + static bool const is_bounded = true; + static bool const is_modulo = false; + static int const digits = 7; + + /// Least positive value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t min() { return cutlass::bfloat16_t::bitcast(0x01); } + + /// Minimum finite value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t lowest() { return cutlass::bfloat16_t::bitcast(0xff7f); } + + /// Maximum finite value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t max() { return cutlass::bfloat16_t::bitcast(0x7f7f); } + + /// Returns smallest finite value + 
CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t epsilon() { return cutlass::bfloat16_t::bitcast(0x1000); } + + /// Returns smallest finite value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t round_error() { return cutlass::bfloat16_t(0.5f); } + + /// Returns smallest finite value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t infinity() { return cutlass::bfloat16_t::bitcast(0x7f80); } + + /// Returns smallest finite value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t quiet_NaN() { return cutlass::bfloat16_t::bitcast(0x7fff); } + + /// Returns smallest finite value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t signaling_NaN() { return cutlass::bfloat16_t::bitcast(0x7fff); } + + /// Returns smallest finite value + CUTLASS_HOST_DEVICE + static cutlass::bfloat16_t denorm_min() { return cutlass::bfloat16_t::bitcast(0x1); } +}; + +} // namespace platform +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Arithmetic operators +// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +CUTLASS_HOST_DEVICE +bool operator==(bfloat16_t const& lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return __heq(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()); +#else + return float(lhs) == float(rhs); +#endif +} + +CUTLASS_HOST_DEVICE +bool operator!=(bfloat16_t const& lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return __hne(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()); +#else + return float(lhs) != float(rhs); +#endif +} + +CUTLASS_HOST_DEVICE +bool operator<(bfloat16_t const& lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return __hlt(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()); +#else + return 
float(lhs) < float(rhs); +#endif +} + +CUTLASS_HOST_DEVICE +bool operator<=(bfloat16_t const& lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return __hle(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()); +#else + return float(lhs) <= float(rhs); +#endif +} + +CUTLASS_HOST_DEVICE +bool operator>(bfloat16_t const& lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return __hgt(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()); +#else + return float(lhs) > float(rhs); +#endif +} + +CUTLASS_HOST_DEVICE +bool operator>=(bfloat16_t const& lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return __hge(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16()); +#else + return float(lhs) >= float(rhs); +#endif +} + +CUTLASS_HOST_DEVICE +bfloat16_t operator+(bfloat16_t const& lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return bfloat16_t(__hadd(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16())); +#else + return bfloat16_t(float(lhs) + float(rhs)); +#endif +} + +CUTLASS_HOST_DEVICE +bfloat16_t operator-(bfloat16_t const& lhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return bfloat16_t(__hneg(lhs.to_nv_bfloat16())); +#else + return bfloat16_t(-float(lhs)); +#endif +} + +CUTLASS_HOST_DEVICE +bfloat16_t operator-(bfloat16_t const& lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return bfloat16_t(__hsub(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16())); +#else + return bfloat16_t(float(lhs) - float(rhs)); +#endif +} + +CUTLASS_HOST_DEVICE +bfloat16_t operator*(bfloat16_t const& lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + return bfloat16_t(__hmul(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16())); +#else + return bfloat16_t(float(lhs) * float(rhs)); +#endif +} + +CUTLASS_HOST_DEVICE +bfloat16_t operator/(bfloat16_t const& lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) 
&& (__CUDA_ARCH__ >= 800) + return bfloat16_t(__hdiv(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16())); +#else + return bfloat16_t(float(lhs) / float(rhs)); +#endif +} + +CUTLASS_HOST_DEVICE +bfloat16_t& operator+=(bfloat16_t & lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + lhs = bfloat16_t(__hadd(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16())); +#else + lhs = bfloat16_t(float(lhs) + float(rhs)); +#endif + return lhs; +} + +CUTLASS_HOST_DEVICE +bfloat16_t& operator-=(bfloat16_t & lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + lhs = bfloat16_t(__hsub(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16())); +#else + lhs = bfloat16_t(float(lhs) - float(rhs)); +#endif + return lhs; +} + +CUTLASS_HOST_DEVICE +bfloat16_t& operator*=(bfloat16_t & lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + lhs = bfloat16_t(__hmul(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16())); +#else + lhs = bfloat16_t(float(lhs) * float(rhs)); +#endif + return lhs; +} + +CUTLASS_HOST_DEVICE +bfloat16_t& operator/=(bfloat16_t & lhs, bfloat16_t const& rhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + lhs = bfloat16_t(__hdiv(lhs.to_nv_bfloat16(), rhs.to_nv_bfloat16())); +#else + lhs = bfloat16_t(float(lhs) / float(rhs)); +#endif + return lhs; +} + +CUTLASS_HOST_DEVICE +bfloat16_t& operator++(bfloat16_t & lhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + lhs = bfloat16_t(__hadd(lhs.to_nv_bfloat16(), bfloat16_t(1.0f).to_nv_bfloat16())); +#else + float tmp(lhs); + ++tmp; + lhs = bfloat16_t(tmp); +#endif + return lhs; +} + +CUTLASS_HOST_DEVICE +bfloat16_t& operator--(bfloat16_t & lhs) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + lhs = bfloat16_t(__hsub(lhs.to_nv_bfloat16(), bfloat16_t(1.0f).to_nv_bfloat16())); +#else + float tmp(lhs); + --tmp; + lhs = bfloat16_t(tmp); +#endif + return lhs; +} + +CUTLASS_HOST_DEVICE +bfloat16_t operator++(bfloat16_t & lhs, int) { + bfloat16_t 
ret(lhs); +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + lhs = bfloat16_t(__hadd(lhs.to_nv_bfloat16(), bfloat16_t(1.0f).to_nv_bfloat16())); +#else + float tmp(lhs); + tmp++; + lhs = bfloat16_t(tmp); +#endif + return ret; +} + +CUTLASS_HOST_DEVICE +bfloat16_t operator--(bfloat16_t & lhs, int) { + bfloat16_t ret(lhs); +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + lhs = bfloat16_t(__hsub(lhs.to_nv_bfloat16(), bfloat16_t(1.0f).to_nv_bfloat16())); +#else + float tmp(lhs); + tmp--; + lhs = bfloat16_t(tmp); +#endif + return ret; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// +// User-defined literals +// + +CUTLASS_HOST_DEVICE +cutlass::bfloat16_t operator "" _bf16(long double x) { + return cutlass::bfloat16_t(float(x)); +} + +CUTLASS_HOST_DEVICE +cutlass::bfloat16_t operator "" _bf16(unsigned long long int x) { + return cutlass::bfloat16_t(int(x)); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/blas3.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/blas3.h new file mode 100644 index 0000000000000000000000000000000000000000..8788f18b99d5c9d700a0f6f28625097f41862c74 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/blas3.h @@ -0,0 +1,143 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Basic include for CUTLASS BLAS3/HPC code. 
+ + +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/blas3_types.h" +#include "cutlass/coord.h" +#include "cutlass/complex.h" +#include "cutlass/functional.h" +#include "cutlass/numeric_types.h" + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines FillMode inversions +template +struct InvertFillMode; + +/// Invert FillMode lower to upper +template <> +struct InvertFillMode { + static FillMode const mode = FillMode::kUpper; +}; + +/// Invert FillMode upper to lower +template <> +struct InvertFillMode { + static FillMode const mode = FillMode::kLower; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines SideMode inversions +template +struct InvertSideMode; + +/// Invert SideMode left to right +template <> +struct InvertSideMode { + static SideMode const mode = SideMode::kRight; +}; + +/// Invert SideMode right to left +template <> +struct InvertSideMode { + static SideMode const mode = SideMode::kLeft; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines correct compare operation for Triangular matrix boundary +template +struct TrMatrixCompareOp { + using Index = int32_t; + using Type = typename platform::conditional< + (kFillMode == FillMode::kLower), + greater_equal, + less_equal>::type; +}; + +template +struct TrMatrixCompareOp { + using Index = int32_t; + using Type = typename platform::conditional< + (kFillMode == FillMode::kLower), + greater_equal, + less_equal>::type; +}; + +template +struct TrMatrixCompareOp { + using Index = int32_t; + using Type = typename platform::conditional< + (kFillMode == FillMode::kLower), + greater, + less>::type; +}; 
+//////////////////////////////////////////////////////////////////////////////////////////////////// +// Returns precision in terms of bits (based on datatype) to fill tensors with. +// Defaults to 5 bits of mantissa for TF32 and FP32 (with implicit round-offs). +// Also defines acceptable mantissa result variance/error. +template +struct MantissaInBits { + static int constexpr bits = 5; + static double constexpr error = 1.0e-7; +}; + +// Full precision is supported for FP64 +template <> +struct MantissaInBits { + static int constexpr bits = 30; + static double constexpr error = 1.0e-15; +}; + +template <> +struct MantissaInBits> { + static int constexpr bits = 30; + static double constexpr error = 1.0e-14; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/blas3_types.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/blas3_types.h new file mode 100644 index 0000000000000000000000000000000000000000..e47002b1a7255478f3a8d08518a4e081cbfd2422 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/blas3_types.h @@ -0,0 +1,78 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/

#pragma once

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Enumerated type describing the type of kernel (based on input or output matrices).
enum class BlasMode {
  kGemm,
  kSymmetric,
  kHermitian,
  kTriangular,
  kInvalid
};

/// Enumerated type describing the fill mode for matrices for BLAS functions.
enum class FillMode {
  kFull,       /// The entire tensor is covered.
  kLower,      /// The 'lower' part of a tensor is covered including diagonal
  kUpper,      /// The 'upper' part of a tensor is covered including diagonal
  kDiagonal,   /// Only diagonal elements are covered.
  kNone,       /// No element is covered.
  kInvalid
};

/// Enumerated type describing the diagonal property of matrices for BLAS functions.
enum class DiagType {
  kNonUnit,
  kUnit,
  kZero,     // Only used internally for computing SYMM/HEMM
  kInvalid
};

/// Enumerated type describing the side dense matrix is in matrix equation for BLAS functions.
enum class SideMode {
  kLeft,
  kRight,
  kInvalid
};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/block_striped.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/block_striped.h
new file mode 100644
index 0000000000000000000000000000000000000000..93665c64047d847a6fc9de3f5ec691caa8186dbc
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/block_striped.h
@@ -0,0 +1,267 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2.
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Utilities for performing block-striped access (load, store, reduce) of trivially-copyable, + statically-sized array types to global memory. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/wmma_array.h" +#include "cutlass/functional.h" +#include "cutlass/complex.h" + +namespace cutlass { + +///////////////////////////////////////////////////////////////////////////////////////////////// +// AccessWidth +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Computes the maximal power-of-two that evenly divides the size of T, capped at Limit +template < + typename T, + int Limit> +struct AccessWidth +{ + // Inductive case + template < + int ObjectBytes, /// Size of T in bytes + int AlignBytes, /// Template induction variable + bool IsAligned = /// Whether ObjectBytes is an even multiple of AlignBytes + ((AlignBytes <= Limit) && (ObjectBytes % AlignBytes == 0))> + struct Detail + { + static const int value = Detail::value; + }; + + // Base case (ObjectBytes is not an even multiple of AlignBytes) + template < + int ObjectBytes, /// Size of T in bytes + int AlignBytes> /// Template induction variable + struct Detail + { + static const int value = AlignBytes / 2; + }; + + /// The maximal power-of-two that evenly divides the size of T + static const int value = Detail< + (int) sizeof(T), + 1>::value; +}; + + + +///////////////////////////////////////////////////////////////////////////////////////////////// +// StripedAccessType +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// ReinterpretCast type for striping a trivially-copyable type in global memory +/// (Default specialization. Striping granularity is type T.) 
+template < + typename T, /// Data type + int TransferBytes = /// Data access width (16 byte max for global memory access on current architectures) + AccessWidth::value> +struct alignas(TransferBytes) StripedAccessType : public T +{}; + + +/// ReinterpretCast type for striping a trivially-copyable type in global memory +/// (Specialization for cutlass::Array. Striping granularity is a multiple of T.) +template < + typename T, /// Array element type + int N, /// Number of elements in array + bool RegisterSized, /// T is register-sized + int TransferBytes> /// Data access width +struct StripedAccessType< + Array, + TransferBytes> +: public AlignedArray< + T, // Element type of StripedAccessType + __NV_STD_MAX(1, TransferBytes / (int) sizeof(T)), // Number of elements T in StripedAccessType + TransferBytes> // Alignment of StripedAccessType +{}; + + +#if defined(CUTLASS_ARCH_WMMA_ENABLED) + +/// ReinterpretCast type for striping a trivially-copyable type in global memory +/// (Specialization for cutlass::WmmaFragmentArray. Striping granularity is a multiple of T.) 
+template< + typename Use, + int m, + int n, + int k, + typename ElementT, + typename Layout, + int kFragments, + int TransferBytes> +struct StripedAccessType< + WmmaFragmentArray, kFragments>, + TransferBytes> +: public AlignedArray< + ElementT, + __NV_STD_MAX(1, TransferBytes / (int) sizeof(ElementT)), + TransferBytes> +{}; + +#endif // if defined(CUTLASS_ARCH_WMMA_ENABLED) + + +///////////////////////////////////////////////////////////////////////////////////////////////// +// BlockStriped +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Utility for performing block-striped access (load, store) of trivially-copyable, +/// statically-sized array types to global memory +template < + int BlockThreads, + typename ArrayT, + typename AccessT = StripedAccessType > +struct BlockStriped +{ + /// Number of striped accesses + static const int kStripes = int(sizeof(ArrayT) / sizeof(AccessT)); + static_assert(kStripes > 0, "AccessT type must be smaller than or equal to ArrayT type"); + + /// Load + CUTLASS_DEVICE + static void load(ArrayT &data, ArrayT *ptr, int thread_idx) + { + AccessT *access_input = reinterpret_cast(ptr); + AccessT *access_data = reinterpret_cast(&data); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kStripes; ++i) { + access_data[i] = access_input[(BlockThreads * i) + thread_idx]; + } + } + + /// Load & Add + CUTLASS_DEVICE + static void load_add(ArrayT &data, ArrayT *ptr, int thread_idx) + { + AccessT *access_input = reinterpret_cast(ptr); + AccessT *access_data = reinterpret_cast(&data); + + plus add; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kStripes; ++i) + { + access_data[i] = add(access_data[i], access_input[(BlockThreads * i) + thread_idx]); + } + } + + /// Store + CUTLASS_DEVICE + static void store(ArrayT *ptr, const ArrayT &data, int thread_idx) + { + AccessT *access_output = reinterpret_cast(ptr); + const AccessT *access_data = reinterpret_cast(&data); + + CUTLASS_PRAGMA_UNROLL + 
for (int i = 0; i < kStripes; ++i) { + access_output[(BlockThreads * i) + thread_idx] = access_data[i]; + } + } + +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// +// BlockStripedReduce +///////////////////////////////////////////////////////////////////////////////////////////////// + + +/// Utility for performing block-striped access (load, store, reduce) of trivially-copyable, +/// statically-sized array types to global memory. +/// (Default specialization) +template < + int BlockThreads, + typename ArrayT, + typename ElementT = typename StripedAccessType::Element> +struct BlockStripedReduce : + BlockStriped< + BlockThreads, + ArrayT, + ElementT> +{ + /// Reduce + CUTLASS_DEVICE + static void reduce(ArrayT *ptr, const ArrayT &data, int thread_idx) + { + cutlass::atomic_add reduce; + ElementT *access_output = reinterpret_cast(ptr); + const ElementT *access_data = reinterpret_cast(&data); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < BlockStripedReduce::kStripes; ++i) { + reduce(access_output + (BlockThreads * i) + thread_idx, access_data[i]); + } + } +}; + + +/// Utility for performing block-striped access (load, store, reduce) of trivially-copyable, +/// statically-sized array types to global memory. +/// (Specialization for half_t. Uses half2 vectorized-reduction.) 
+template < + int BlockThreads, + typename ArrayT> +struct BlockStripedReduce : + BlockStriped< + BlockThreads, + ArrayT, + half2> +{ + static_assert(BlockStripedReduce::kStripes % 2 == 0, "Array of half must be even number in length"); + + /// Reduce + CUTLASS_DEVICE + static void reduce(ArrayT *ptr, const ArrayT &data, int thread_idx) + { + cutlass::atomic_add reduce; + half2 *access_output = reinterpret_cast(ptr); + const half2 *access_data = reinterpret_cast(&data); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < BlockStripedReduce::kStripes; ++i) + { + reduce(access_output + (BlockThreads * i) + thread_idx, access_data[i]); + } + } +}; + + +} // namespace cutlass + diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/cluster_launch.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/cluster_launch.hpp new file mode 100644 index 0000000000000000000000000000000000000000..22c17dba702f62eeab80ab5b3399bda269f4f4d2 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/cluster_launch.hpp @@ -0,0 +1,394 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief CUDA interfaces to launch CUTLASS device-level operators (for >= SM90) that use thread-block clusters. 
+*/ + +#pragma once + +#include +#include "cutlass/cutlass.h" +#include "cutlass/trace.h" +#include +#include "cutlass/arch/synclog.hpp" + +#if defined(__CUDACC_RTC__) +#include CUDA_STD_HEADER(type_traits) +#else +#include +#include +#endif + +#if ((__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8))) +# define CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED +#endif + +#if (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 8)) + # define CUDA_ENABLE_PREFERRED_CLUSTER +#endif +namespace cutlass { + +#ifndef NDEBUG +#define Return_Status(cudaError_t_status) \ + if (cudaError_t_status != cudaSuccess) { \ + fprintf(stderr, \ + "[ ERROR: CUDA Runtime ] %s:%d: %s\n", \ + __FILE__, \ + __LINE__, \ + cudaGetErrorString(cudaError_t_status)); \ + return Status::kInvalid; \ + } else { \ + return Status::kSuccess; \ + } +#else +#define Return_Status(cudaError_t_status) \ + if (cudaError_t_status != cudaSuccess) { \ + return Status::kInvalid; \ + } else { \ + return Status::kSuccess; \ + } +#endif + +struct ClusterLauncher { + constexpr static int MaxClusterSize = 32; + + struct LaunchConfig { +#if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED) + cudaLaunchConfig_t launch_config; + + #if defined(CUDA_ENABLE_PREFERRED_CLUSTER) + constexpr static int numAttrs = 3; + #else + + constexpr static int numAttrs = 2; + #endif + cudaLaunchAttribute launch_attribute[numAttrs]; + // Commonly used utility functions + dim3 gridDim() { return launch_config.gridDim; } + dim3 blockDim() { return launch_config.blockDim; } +#endif + }; + + // Check for hardware compatibility + static inline CUTLASS_HOST + Status check_cluster_dims(dim3 grid, dim3 cluster) { + if (((cluster.x * cluster.y * cluster.z) <= MaxClusterSize) && + (grid.x % cluster.x == 0) && (grid.y % cluster.y == 0) && (grid.z % cluster.z == 0)) { + return Status::kSuccess; + } + else { + CUTLASS_TRACE_HOST("ClusterLauncher: Invalid cluster configuration -- aborting 
launch."); + return Status::kInvalid; + } + } + + static inline CUTLASS_HOST + Status +#if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED) + init(void const* kernel_function) +#else + init(void const* /* kernel_function */) +#endif + { +#if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED) +#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1) + if (kernel_function == nullptr) { + CUTLASS_TRACE_HOST("kernel_function is null"); + return Status::kInvalid; + } + CUTLASS_TRACE_HOST("Checking previous error state before calling cudaFuncSetAttribute"); + cudaError_t prevStatus = cudaGetLastError(); + if (prevStatus != cudaSuccess) { + fprintf(stderr, + "[ ERROR: CUDA Runtime ] %s:%d: %s\n", + __FILE__, + __LINE__, + cudaGetErrorString(prevStatus)); + return Status::kInvalid; + } + CUTLASS_TRACE_HOST("Calling cudaFuncSetAttribute"); +#endif + // This attribute was added in CUDA 11.8. + cudaError_t status = + cudaFuncSetAttribute( + kernel_function, cudaFuncAttributeNonPortableClusterSizeAllowed, 1); + Return_Status(status); +#else + return Status::kInvalid; +#endif + } + + static inline CUTLASS_HOST + LaunchConfig make_cluster_launch_config( + dim3 const grid_dims, + dim3 const cluster_dims, + dim3 const block_dims, + size_t const smem_size = 0, + cudaStream_t cuda_stream = 0, + bool launch_with_pdl = false + , dim3 const fallback_cluster_dims = {0, 0, 0} + ) { + LaunchConfig cluster_launch_config; +#if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED) + auto &launch_config = cluster_launch_config.launch_config; + auto &launch_attribute = cluster_launch_config.launch_attribute; + auto numAttrs = cluster_launch_config.numAttrs; + + launch_attribute[0].id = cudaLaunchAttributeClusterDimension; + + bool have_fallback = fallback_cluster_dims.x * fallback_cluster_dims.y * fallback_cluster_dims.z > 0; + + if (have_fallback) { + launch_attribute[0].val.clusterDim = {fallback_cluster_dims.x, fallback_cluster_dims.y, fallback_cluster_dims.z}; + 
CUTLASS_TRACE_HOST("ClusterLauncher: Setting fallback ClusterDims = " + "(" << fallback_cluster_dims.x << ", " << fallback_cluster_dims.y << ", " << fallback_cluster_dims.z << ")\n"); + } + else { + + launch_attribute[0].val.clusterDim = {cluster_dims.x, cluster_dims.y, cluster_dims.z}; + CUTLASS_TRACE_HOST("ClusterLauncher: Setting ClusterDims = " + "(" << cluster_dims.x << ", " << cluster_dims.y << ", " << cluster_dims.z << ")\n"); + + } + +#if defined(CUDA_ENABLE_PREFERRED_CLUSTER) + if (have_fallback) { + if (cute::initialize_preferred_cluster_launch(nullptr, grid_dims, cluster_dims, fallback_cluster_dims)) { + launch_attribute[1].id = cudaLaunchAttributePreferredClusterDimension; + launch_attribute[1].val.preferredClusterDim = {cluster_dims.x, cluster_dims.y, cluster_dims.z}; + CUTLASS_TRACE_HOST("ClusterLauncher: Setting preferred ClusterDims = " + "(" << cluster_dims.x << ", " << cluster_dims.y << ", " << cluster_dims.z << ")\n"); + } + } + else { + numAttrs--; + } +#endif + + + // PDL attributes + launch_attribute[numAttrs - 1].id = cudaLaunchAttributeProgrammaticStreamSerialization; + launch_attribute[numAttrs - 1].val.programmaticStreamSerializationAllowed = 1; + + launch_config.gridDim = {grid_dims.x, grid_dims.y, grid_dims.z}; + launch_config.blockDim = {block_dims.x, block_dims.y, block_dims.z}; + launch_config.dynamicSmemBytes = smem_size; + launch_config.stream = cuda_stream; + launch_config.numAttrs = launch_with_pdl ? numAttrs : numAttrs - 1; + launch_config.attrs = launch_attribute; + return cluster_launch_config; +#else + CUTLASS_TRACE_HOST("ClusterLauncher: CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED not defined! 
Aborting cluster launch."); + return cluster_launch_config; +#endif + } + + // This is the method we expect to use going forward + static inline CUTLASS_HOST + Status launch( + dim3 const grid_dims, + dim3 const cluster_dims, + dim3 const block_dims, + size_t const smem_size, + cudaStream_t cuda_stream, + void const* kernel, + void** kernel_params, + bool launch_with_pdl = false) { +#if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED) + LaunchConfig cluster_launch_config = make_cluster_launch_config(grid_dims, cluster_dims, + block_dims, smem_size, cuda_stream, launch_with_pdl); + + auto launch_grid_dims = cluster_launch_config.gridDim(); + if (check_cluster_dims(launch_grid_dims, cluster_dims) != Status::kSuccess) { + CUTLASS_TRACE_HOST("ClusterLauncher: check_cluster_dims() failed. Aborting."); + return Status::kInvalid; + } + + auto init_status = init(kernel); + if (init_status != Status::kSuccess) { + CUTLASS_TRACE_HOST("ClusterLauncher: init(kernel) failed with status " << int(init_status) << ". Aborting."); + return Status::kInvalid; + } + + CUTLASS_TRACE_HOST("ClusterLauncher: Launching GridDims = " + "(" << launch_grid_dims.x << ", " << launch_grid_dims.y << ", " << launch_grid_dims.z << "), " + "And ClusterDims = " + "(" << cluster_dims.x << ", " << cluster_dims.y << ", " << cluster_dims.z << ")\n"); + + cutlass::arch::synclog_setup(); + cudaError_t status = cudaLaunchKernelExC(&cluster_launch_config.launch_config, kernel, kernel_params); + Return_Status(status); +#else + CUTLASS_TRACE_HOST("ClusterLauncher: CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED not defined! 
Aborting cluster launch."); + return Status::kInvalid; +#endif + } + + + // This is the method we expect to use going forward + // Launch a preferred cluster grid + static inline CUTLASS_HOST + Status launch_with_fallback_cluster( + dim3 const grid_dims, + dim3 const preferred_cluster_dims, + dim3 const fallback_cluster_dims, + dim3 const block_dims, + size_t const smem_size, + cudaStream_t cuda_stream, + void const* kernel, + void** kernel_params, + bool launch_with_pdl = false) { +#if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED) + LaunchConfig cluster_launch_config = make_cluster_launch_config(grid_dims, preferred_cluster_dims, + block_dims, smem_size, cuda_stream, launch_with_pdl, fallback_cluster_dims); + + auto launch_grid_dims = cluster_launch_config.gridDim(); + if (check_cluster_dims(launch_grid_dims, preferred_cluster_dims) != Status::kSuccess) { + CUTLASS_TRACE_HOST("ClusterLauncher: check_cluster_dims() failed. Aborting."); + return Status::kInvalid; + } + + auto init_status = init(kernel); + if (init_status != Status::kSuccess) { + CUTLASS_TRACE_HOST("ClusterLauncher: init(kernel) failed with status " << int(init_status) << ". Aborting."); + return Status::kInvalid; + } + + CUTLASS_TRACE_HOST("ClusterLauncher: Launching \n\tGridDims = " + "(" << launch_grid_dims.x << ", " << launch_grid_dims.y << ", " << launch_grid_dims.z << "), " + "\n\tPreferred ClusterDims = " + "(" << preferred_cluster_dims.x << ", " << preferred_cluster_dims.y << ", " << preferred_cluster_dims.z << ")," + "\n\tFallback ClusterDims = " + "(" << fallback_cluster_dims.x << ", " << fallback_cluster_dims.y << ", " << fallback_cluster_dims.z << ")\n"); + + cutlass::arch::synclog_setup(); + cudaError_t status = cudaLaunchKernelExC(&cluster_launch_config.launch_config, kernel, kernel_params); + Return_Status(status); +#else + CUTLASS_TRACE_HOST("ClusterLauncher: CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED not defined! 
Aborting cluster launch."); + return Status::kInvalid; +#endif + } + + +}; + +namespace detail { + +template +void* checked_addressof(Arg&& arg) { + static_assert(! std::is_rvalue_reference_v || ! std::is_const_v, "You cannot take the address of a const rvalue reference (const T&&)."); + // We use std::addressof to ensure we get the address, + // in case the type has an overloaded operator&. + // Note that this precludes `const T&&` references. + return const_cast(reinterpret_cast(std::addressof(arg))); +} + +} // namespace detail + +//! Parameters for launch_on_cluster (see below). +struct ClusterLaunchParams { + //! Grid dimensions + dim3 grid_dims{1, 1, 1}; + + //! Block dimensions + dim3 block_dims{1, 1, 1}; + + //! Cluster dimensions + dim3 cluster_dims{1, 1, 1}; + + //! Number of bytes required for the kernel's shared memory. + int smem_size_in_bytes = 0; + + //! CUDA stream on which to launch the kernel. + cudaStream_t cuda_stream = nullptr; +}; + +/// @brief Launch the kernel on the stream using cluster launch. +/// +/// @param params Cluster launch parameters (see above). +/// @param kernel_ptr Pointer to the kernel function (see example). +/// @param args Zero or more arguments to pass to the kernel. +/// +/// @tparam Args Types of the arguments passed to the kernel. +/// Don't specify this/these template argument(s) explicitly. +/// +/// @return Status::Success on success, else an error code. +/// +/// @code +/// template +/// __global__ void kernel(A a, B b, C c); +/// +/// X x = get_x(); +/// Y y = get_y(); +/// Z z = get_z(); +/// +/// void const* kernel_ptr = +/// const_cast(reinterpret_cast( +/// &kernel)); +/// auto status = launch_kernel_on_cluster( +/// {grid_dims, block_dims, cluster_dims, sizeof(SharedMemory)}, +/// kernel_ptr, x, y, z); +/// @endcode +template +CUTLASS_HOST cutlass::Status +launch_kernel_on_cluster(const ClusterLaunchParams& params, + void const* kernel_ptr, + Args&& ... 
args) +{ + // Unfortunately, we find ourselves needing to pass in + // the parameters as an array of raw pointers. + if constexpr (sizeof...(Args) == 0) { + return cutlass::ClusterLauncher::launch( + params.grid_dims, + params.cluster_dims, + params.block_dims, + params.smem_size_in_bytes, + params.cuda_stream, + kernel_ptr, nullptr); + } + else { + void* kernel_params[sizeof...(Args)] = { + detail::checked_addressof(std::forward(args))... + }; + return cutlass::ClusterLauncher::launch( + params.grid_dims, + params.cluster_dims, + params.block_dims, + params.smem_size_in_bytes, + params.cuda_stream, + kernel_ptr, + kernel_params); + } +} + +} // namespace cutlass diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/complex.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/complex.h new file mode 100644 index 0000000000000000000000000000000000000000..0287850bc6febe16a90695c82fabee566cdf9a82 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/complex.h @@ -0,0 +1,821 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include + +#include +#include "cutlass/cutlass.h" +#if defined(__CUDACC_RTC__) +#include CUDA_STD_HEADER(cstdint) +#else +#include +#endif +#include "cutlass/functional.h" +#include "cutlass/platform/platform.h" +#include "cutlass/real.h" + +#include "cutlass/numeric_types.h" + +#include "cutlass/fast_math.h" + +#if !defined(__CUDACC_RTC__) +#include +#endif + +namespace cutlass { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Enumeraed type describing a transformation on a complex value. 
+enum class ComplexTransform { + kNone, + kConjugate +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines ComplexTransform inversions +template +struct InvertComplexTransform; + +/// Invert ComplexTransform from kNone to kConjugate +template <> +struct InvertComplexTransform { + static ComplexTransform const transform = ComplexTransform::kConjugate; +}; + +/// Invert ComplexTransform from kConjugate to kNone +template <> +struct InvertComplexTransform { + static ComplexTransform const transform = ComplexTransform::kNone; +}; +///////////////////////////////////////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////////////////// + +// +// Accessors for CUDA complex types +// + +#if !defined(__CUDACC_RTC__) +/// Returns the real part of the complex number +CUTLASS_HOST_DEVICE +float const &real(cuFloatComplex const &z) { return z.x; } + +/// Returns the real part of the complex number +CUTLASS_HOST_DEVICE +float &real(cuFloatComplex &z) { return z.x; } + +/// Returns the real part of the complex number +CUTLASS_HOST_DEVICE +double const &real(cuDoubleComplex const &z) { return z.x; } + +/// Returns the real part of the complex number +CUTLASS_HOST_DEVICE +double &real(cuDoubleComplex &z) { return z.x; } + +/// Returns the imaginary part of the complex number +CUTLASS_HOST_DEVICE +float const &imag(cuFloatComplex const &z) { return z.y; } + +/// Returns the imaginary part of the complex number +CUTLASS_HOST_DEVICE +float &imag(cuFloatComplex &z) { return z.y; } + +/// Returns the imaginary part of the complex number +CUTLASS_HOST_DEVICE +double const &imag(cuDoubleComplex const &z) { return z.y; } + +/// Returns the imaginary part of the complex number +CUTLASS_HOST_DEVICE +double &imag(cuDoubleComplex &z) { return z.y; } + +// Returns the conjugate of the complex number +CUTLASS_HOST_DEVICE cuFloatComplex 
+conj(cuFloatComplex const& z) { + return make_cuFloatComplex(z.x, -z.y); +} + +// Returns the conjugate of the complex number +CUTLASS_HOST_DEVICE cuDoubleComplex +conj(cuDoubleComplex const& z) { + return make_cuDoubleComplex(z.x, -z.y); +} +#endif + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Class for representing and manipulating complex numbers with conversions from built-in CUDA +/// complex types. + +template +class complex +{ + public: + /// Type alias for scalar type + using value_type = T; + + private: + // + // Data members + // + + /// Real part + T _real; + + /// Imaginary part + T _imag; + + public: + +// +// Methods +// + + /// Default constructor + complex() = default; + + /// Constructor + CUTLASS_HOST_DEVICE + complex(T r) : _real(r), _imag(T(0)) {} + + /// Constructor + CUTLASS_HOST_DEVICE + complex(T r, T i) : _real(r), _imag(i) {} + + /// Constructor + template + CUTLASS_HOST_DEVICE + complex(complex const &z) : _real(static_cast(z.real())), _imag(static_cast(z.imag())) {} + + + #if !defined(__CUDACC_RTC__) + /// Conversion from cuFloatComplex + CUTLASS_HOST_DEVICE + complex(cuFloatComplex const &z) : _real(static_cast(cuCrealf(z))), _imag(static_cast(cuCimagf(z))) {} + + /// Conversion from cuDoubleComplex + CUTLASS_HOST_DEVICE + complex(cuDoubleComplex const &z) : _real(static_cast(cuCreal(z))), _imag(static_cast(cuCimag(z))) {} + #endif + + /// Equality operator + CUTLASS_HOST_DEVICE bool operator==(complex const &rhs) const { + return this->real() == rhs.real() && this->imag() == rhs.imag(); + } + + /// Inequality operator + CUTLASS_HOST_DEVICE bool operator!=(complex const &rhs) const { + return !(*this == rhs); + } + + /// Addition + template + CUTLASS_HOST_DEVICE complex operator+(complex const &rhs) const { + return complex(this->real() + rhs.real(), this->imag() + rhs.imag()); + } + + /// Reduction into memory address. Components may update out of order. 
+ template + CUTLASS_DEVICE void red(complex *ptr) const { + static_assert(platform::is_same::value, "Component type must match"); + cutlass::atomic_add reduce; + reduce(&ptr->_real, _real); + reduce(&ptr->_imag, _imag); + } + + /// Reduction into memory address. Components may update out of order. (Half specialization) + CUTLASS_DEVICE void red(complex *ptr) const { + static_assert(platform::is_same::value, "Component type must match"); + half2 *h2_ptr = reinterpret_cast(ptr); + half2 h2_data = reinterpret_cast(*this); + cutlass::atomic_add reduce; + reduce(h2_ptr, h2_data); + } + + /// Subtraction + template + CUTLASS_HOST_DEVICE complex operator-(complex const &rhs) const { + return complex(this->real() - rhs.real(), this->imag() - rhs.imag()); + } + + /// Multiplication + template + CUTLASS_HOST_DEVICE complex operator*(complex const &rhs) const { + return complex(this->real() * rhs.real() - this->imag() * rhs.imag(), + this->real() * rhs.imag() + this->imag() * rhs.real()); + } + + /// Scalar Multiplication + template + CUTLASS_HOST_DEVICE complex operator*(A const &s) const { + return complex(this->real() * s, this->imag() * s); + } + + /// Division + template + CUTLASS_HOST_DEVICE complex operator/(complex const &rhs) const { + T d = T(rhs.real() * rhs.real() + rhs.imag() * rhs.imag()); + + return complex( + (real() * rhs.real() + imag() * rhs.imag()) / d, + (imag() * rhs.real() - real() * rhs.imag()) / d + ); + } + + /// Scalar Division + template + CUTLASS_HOST_DEVICE complex operator/(A const &s) const { + return complex(this->real() / s, this->imag() / s); + } + + /// Addition + template + CUTLASS_HOST_DEVICE complex &operator+=(complex const &rhs) { + *this = *this + rhs; + return *this; + } + + /// Subtraction + template + CUTLASS_HOST_DEVICE complex &operator-=(complex const &rhs) { + *this = *this - rhs; + return *this; + } + + /// Multiplication + template + CUTLASS_HOST_DEVICE complex &operator*=(complex const &rhs) { + *this = *this * rhs; + 
return *this; + } + + /// Scalar multiplication + template + CUTLASS_HOST_DEVICE complex &operator*=(A s) { + *this = *this * s; + return *this; + } + + /// Division + template + CUTLASS_HOST_DEVICE complex &operator/=(complex const &rhs) { + *this = *this / rhs; + return *this; + } + + /// Accesses the real part of the complex number + CUTLASS_HOST_DEVICE + T const &real() const { return _real; } + + /// Accesses the real part of the complex number + CUTLASS_HOST_DEVICE + T &real() { return _real; } + + /// Accesses the imaginary part of the complex number + CUTLASS_HOST_DEVICE + T const &imag() const { return _imag; } + + /// Accesses the imaginary part of the complex number + CUTLASS_HOST_DEVICE + T &imag() { return _imag; } + + /// Set the real part of the complex number + CUTLASS_HOST_DEVICE + void real(T real) { _real = real; } + + /// Set the imaginary part of the complex number + CUTLASS_HOST_DEVICE + void imag(T imag) { _imag = imag; } + + #if !defined(__CUDACC_RTC__) + /// Converts to cuFloatComplex + CUTLASS_HOST_DEVICE + explicit operator cuFloatComplex() const { return make_cuFloatComplex(float(real()), float(imag())); } + + /// Converts to cuDoubleComplex + CUTLASS_HOST_DEVICE + explicit operator cuDoubleComplex() const { return make_cuDoubleComplex(real(), imag()); } + #endif +}; + +// Complex conjugate +template +CUTLASS_HOST_DEVICE complex conj(complex const& z) { + return {z.real(), -z.imag()}; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// +// Accessors for complex template +// + +// Nonmember real and imag need to work for non-complex numbers too. +// That means cutlass::complex, std::complex, cuda::std::complex, and +// any user-defined complex number type that looks like std::complex. +// It's reasonable to assume that a "complex number type" has +// zero-argument real() and imag() member functions returning +// non-void. 
While cuFloatComplex and cuDoubleComplex lack those +// member functions, one-argument nonmember real and imag overloads +// for those types are defined above. + +namespace detail { + +template +struct has_zero_argument_real_member_function : + cutlass::platform::false_type +{}; + +template +struct has_zero_argument_real_member_function().real()) + > + > +> : cutlass::platform::true_type +{}; + +template +constexpr bool has_zero_argument_real_member_function_v = + has_zero_argument_real_member_function::value; + +template +struct has_zero_argument_imag_member_function : + cutlass::platform::false_type +{}; + +template +struct has_zero_argument_imag_member_function().imag()) + > + > +> : cutlass::platform::true_type +{}; + +template +constexpr bool has_zero_argument_imag_member_function_v = + has_zero_argument_imag_member_function::value; + +} // namespace detail + +template +CUTLASS_HOST_DEVICE auto real(T z) { + if constexpr (detail::has_zero_argument_real_member_function_v) { + return z.real(); + } else { + return z; + } +} + +template +CUTLASS_HOST_DEVICE auto imag(T z) { + if constexpr (detail::has_zero_argument_imag_member_function_v) { + return z.imag(); + } else { + // Imaginary part of a non-complex input has the same type as the + // input, and its value is zero. CUTLASS assumes in this case + // that value-initializing T is well-formed and results in zero. + return T{}; + } +} + +// +// Output operators +// + +#if !defined(__CUDACC_RTC__) +template +std::ostream &operator<<(std::ostream &out, complex const &z) { + T _r = real(z); + T _i = imag(z); + + if (bool(_i)) { + return out << _r << "+i" << _i; + } + return out << _r; +} +#endif + +// +// Non-member operators defined for complex types +// + + +// +// Non-member functions defined for complex numbers +// + +// abs returns the magnitude of the complex number. 
+ +CUTLASS_HOST_DEVICE float abs(complex const &z) { + return ::hypot(z.real(), z.imag()); +} + +CUTLASS_HOST_DEVICE double abs(complex const &z) { + return ::hypot(z.real(), z.imag()); +} + +// In theory, it would make sense to add a complex +// specialization of abs here, since hypot works for long double too. +// In practice, long double doesn't have a portable number of bits or +// behavior, so users who care about higher-precision floating-point +// computation should probably insist on an actual FP128 type. + +template +CUTLASS_HOST_DEVICE T abs(complex const &z) { + // cutlass::complex permits all kinds of T, including types that + // don't have NaN. For a generic floating-point type with Inf + // and/or NaN, LAPACK's DLAPY2 algorithm would make sense, as it + // would handle issues like avoiding unwarranted overflow if + // z.real() or z.imag() is slightly bigger than the square root of + // the max finite number. That could be a future improvement; for + // now, the code just uses the naive algorithm. + // + // Use the "swap two-step" idiom so that argument-dependent lookup + // can find any CUTLASS-specific overloads. 
+ using cutlass::sqrt; + return sqrt(z.real() * z.real() + z.imag() * z.imag()); +} + +/// Returns the magnitude of the complex number +template +CUTLASS_HOST_DEVICE T arg(complex const &z) { + return atan2(imag(z), real(z)); +} + +/// Returns the squared magnitude of a real number +template +CUTLASS_HOST_DEVICE T norm(T const &z) { + return z * z; +} + +/// Returns the squared magnitude of a real number +template <> +CUTLASS_HOST_DEVICE int8_t norm(int8_t const &z) { + return static_cast(z * z); +} + +/// Returns the squared magnitude of a complex number +template +CUTLASS_HOST_DEVICE double norm(complex const &z) { + return real(z) * real(z) + imag(z) * imag(z); +} + +/// Norm-accumulate calculation +template +CUTLASS_HOST_DEVICE R norm_accumulate(T const &x, R const & accumulator) { + return accumulator + static_cast(x) * static_cast(x); +} + +/// Norm accumulate specialized for complex types +template +CUTLASS_HOST_DEVICE R norm_accumulate(complex const &z, R const &accumulator) { + return accumulator + static_cast(real(z)) * static_cast(real(z)) + + static_cast(imag(z)) * static_cast(imag(z)); +} + +namespace detail { + +template +CUTLASS_HOST_DEVICE T conj_impl(T const& z, cutlass::platform::true_type) { + return conj(z); +} + +template +CUTLASS_HOST_DEVICE T conj_impl(T const& z, cutlass::platform::false_type) { + return z; +} + +template +CUTLASS_HOST_DEVICE T conj_impl(T const& z) { + constexpr bool use_unqualified_conj = + ! cutlass::platform::is_arithmetic_v && + ! detail::has_cutlass_conj_v && + detail::has_unqualified_conj_v; + return conj_impl(z, cutlass::platform::bool_constant{}); +} + +} // namespace detail + +// Return the complex conjugate of the input. +// +// This MUST be a function and not a function object, because it may +// be common practice for downstream types to define specifically +// cutlass::conj overloads, instead of overloads in their namespace. 
+// +// As a result of this being a function and not a function object, +// CUTLASS code needs to declare "using cutlass::conj;" in scope and +// then call this function unqualified, just like std::swap. +// +// If an overload already exists for cutlass::conj(T), that overload +// will be called instead of this one. Otherwise: +// +// 1. for arithmetic types, return z; +// +// 2. for types where (namespace-unqualified) conj(z) is well formed +// and cutlass::conj(z) is NOT well formed, return conj(z); and, +// +// 3. for everything else, return z. +// +// Regarding (1), the C++ Standard Library makes std::conj always +// return std::complex, even for (noncomplex) arithmetic types. +// cutlass::conj(T t) needs to return type T. This follows the +// convention of linear algebra software like the BLAS, where +// "conjugate transpose" means the same thing as "transpose" for a +// matrix of noncomplex numbers. +// +// Case (2) covers std::complex, cuda::std::complex, and non-Standard +// (including user-defined) complex number types (for which "conj(z)" +// is findable via argument-dependent lookup, but does not live in the +// cutlass namespace). It excludes cutlass::conj(z) in order to +// prevent infinite recursion. +// +// Case (3) covers non-Standard non-complex number types. +template +CUTLASS_HOST_DEVICE T conj(T const& z) { + return detail::conj_impl(z); +} + +/// Projects the complex number z onto the Riemann sphere +template +CUTLASS_HOST_DEVICE complex proj(complex const &z) { + T d = real(z) * real(z) + imag(z) * imag(z) + T(1); + return complex((T(2) * real(z)) / d, (T(2) * imag(z)) / d); +} + +/// Returns a complex number with magnitude r and phase theta +template +CUTLASS_HOST_DEVICE complex polar(T const &r, T const &theta = T()) { + return complex(r * cos(theta), r * sin(theta)); +} + +/// Computes the complex exponential of z. 
+template +CUTLASS_HOST_DEVICE complex exp(complex const &z) { + return complex(fast_exp(real(z)) * fast_cos(imag(z)), fast_exp(real(z)) * fast_sin(imag(z))); +} + +/// Computes the log of z +template +CUTLASS_HOST_DEVICE complex log(complex const &z) { + return complex(log(abs(z)), arg(z)); +} + +/// Computes the log base 10 of z +template +CUTLASS_HOST_DEVICE complex log10(complex const &z) { + return log(z) / T(log(T(10))); +} + +/// Computes the square root of complex number z +template +CUTLASS_HOST_DEVICE complex sqrt(complex const &z) { + return sqrt(T(2)) / T(2) * + complex(sqrt(sqrt(norm(z)) + real(z)), + (imag(z) < 0 ? T(-1) : T(1)) * sqrt(sqrt(norm(z)) - real(z))); +} + +/// Computes the cosine of complex z. +template +CUTLASS_HOST_DEVICE complex cos(complex const &z) { + return (exp(z) + exp(-z)) / T(2); +} + +/// Computes the sin of complex z. +template +CUTLASS_HOST_DEVICE complex sin(complex const &z) { + return (exp(-z) - exp(z)) * complex(T(0), T(1) / T(2)); +} + +/// Comparison +template +CUTLASS_HOST_DEVICE bool operator<(complex const &lhs, complex const &rhs) { + return true; +} + +////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for complex-valued type. 
+template +struct RealType< complex > +{ + using Type = T; + + /// Number of elements + static int const kExtent = 2; + + CUTLASS_HOST_DEVICE + static complex from_real(double x) { + return complex(static_cast(x)); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template <> +CUTLASS_HOST_DEVICE +cutlass::complex from_real >(double r) { + return cutlass::complex(half_t(r)); +} + +template <> +CUTLASS_HOST_DEVICE +cutlass::complex from_real >(double r) { + return cutlass::complex(float(r)); +} + +template <> +CUTLASS_HOST_DEVICE +cutlass::complex from_real >(double r) { + return cutlass::complex(r); +} + +////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct is_complex { + static bool const value = false; +}; + +template +struct is_complex> { + static bool const value = true; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// +// functional.h numeric specializations +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Squares with optional conversion +template +struct magnitude_squared, Output> { + CUTLASS_HOST_DEVICE + Output operator()(complex lhs) const { + multiplies mul_op; + + Output y_r = Output(lhs.real()); + Output y_i = Output(lhs.imag()); + + return mul_op(y_r, y_r) + mul_op(y_i, y_i); + } +}; + +/// Fused multiply-add +template +struct multiply_add, complex, complex> { + CUTLASS_HOST_DEVICE + complex operator()( + complex const &a, + complex const &b, + complex const &c) const { + + T real = c.real(); + T imag = c.imag(); + + real += a.real() * b.real(); + real += -a.imag() * b.imag(); + imag += a.real() * b.imag(); + imag += a.imag () * b.real(); + + return complex{ + real, + imag + }; + } +}; + +/// Fused multiply-add +template +struct multiply_add, T, complex> { + CUTLASS_HOST_DEVICE + complex operator()( + complex 
const &a, + T const &b, + complex const &c) const { + + T real = c.real(); + T imag = c.imag(); + + real += a.real() * b; + imag += a.imag () * b; + + return complex{ + real, + imag + }; + } +}; + +/// Fused multiply-add +template +struct multiply_add, complex> { + CUTLASS_HOST_DEVICE + complex operator()( + T const &a, + complex const &b, + complex const &c) const { + + T real = c.real(); + T imag = c.imag(); + + real += a * b.real(); + imag += a * b.imag(); + + return complex{ + real, + imag + }; + } +}; + +/// Conjugate +template +struct conjugate> { + CUTLASS_HOST_DEVICE + complex operator()(complex const &a) const { + // Invoke the complex overload specifically, rather than + // wasting the compiler's effort on overload resolution. + return cutlass::conj(a); + } +}; + +#if ! defined(__CUDACC_RTC__) +template <> +struct conjugate { + CUTLASS_HOST_DEVICE + cuFloatComplex operator()(cuFloatComplex const& z) const { + return make_cuFloatComplex(z.x, -z.y); + } +}; + +template <> +struct conjugate { + CUTLASS_HOST_DEVICE + cuDoubleComplex operator()(cuDoubleComplex const& z) const { + return make_cuDoubleComplex(z.x, -z.y); + } +}; +#endif + +/// Computes the square of a difference with optional conversion +template +struct magnitude_squared_difference, Output> { + CUTLASS_HOST_DEVICE + Output operator()(complex lhs, complex rhs) const { + multiplies mul_op; + + Output y_r = Output(lhs.real()) - Output(rhs.real()); + Output y_i = Output(lhs.imag()) - Output(rhs.imag()); + + return mul_op(y_r, y_r) + mul_op(y_i, y_i); + } +}; + +/// Reduces value into the data pointed to by ptr (complex specialization) +template +struct atomic_add> { + CUTLASS_DEVICE + void operator()(complex *ptr, const complex &data) + { + data.red(ptr); + } +}; + + +////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +////////////////////////////////////////////////////////////////////////////////////////////////// diff 
--git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/constants.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/constants.h new file mode 100644 index 0000000000000000000000000000000000000000..f5df01726b3f4dbc88bf2fd6f15092cff2b55fac --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/constants.h @@ -0,0 +1,1239 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/* \file + \brief Boost-style constant definitions for floating-point types. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/complex.h" + +/////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace constants { + +/////////////////////////////////////////////////////////////////////////////////// + +// +// Primary templates +// + +/// Returns 1, the multiplicative identity element +template CUTLASS_HOST_DEVICE T one(); + +/// Returns 0, the additive identity element +template CUTLASS_HOST_DEVICE T zero(); + +/// Returns 2 +template CUTLASS_HOST_DEVICE T two(); + +/// Returns pi, approximately 3.141 +template CUTLASS_HOST_DEVICE T pi(); + +/// Returns 2 * pi +template CUTLASS_HOST_DEVICE T two_pi(); + +/// Returns pi / 2 +template CUTLASS_HOST_DEVICE T half_pi(); + +/// Returns sqrt(pi) +template CUTLASS_HOST_DEVICE T root_pi(); + +/// Returns sqrt(pi / 2) +template CUTLASS_HOST_DEVICE T root_half_pi(); + +/// Returns sqrt(2 * pi) +template CUTLASS_HOST_DEVICE T root_two_pi(); + +/// Returns sqrt(ln(4)) +template CUTLASS_HOST_DEVICE T root_ln_four(); + +/// Returns e, approximately 2.718... 
+template CUTLASS_HOST_DEVICE T e(); + +/// Returns (1/2) +template CUTLASS_HOST_DEVICE T half(); + +/// Returns sqrt(2), approximately 1.414... +template CUTLASS_HOST_DEVICE T root_two(); + +/// Returns sqrt(2)/2, approximately 0.707... +template CUTLASS_HOST_DEVICE T half_root_two(); + +/// Returns ln(2), approximately 0.693... +template CUTLASS_HOST_DEVICE T ln_two(); + +/// Returns ln(ln(2)), approximately -0.3665... +template CUTLASS_HOST_DEVICE T ln_ln_two(); + +/// Returns 1/3, approximately 0.333... +template CUTLASS_HOST_DEVICE T third(); + +/// Returns 2/3, approximately 0.666... +template CUTLASS_HOST_DEVICE T twothirds(); + +/// Returns pi - 3, approximately 0.1416... +template CUTLASS_HOST_DEVICE T pi_minus_three(); + +/// Returns 4 - pi, approximately 0.858... +template CUTLASS_HOST_DEVICE T four_minus_pi(); + + +///////////////////////////////////////////////////////////////////////////////////// + +// Specialization for double + +/// Returns 1, the multiplicative identity element (specialization for double) +template <> CUTLASS_HOST_DEVICE double one() { + uint64_t bits = 0x3ff0000000000000ull; + return reinterpret_cast(bits); +} + +/// Returns 1, the multiplicative identity element (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex one< complex >() { + return complex(one(), double()); +} + +/// Returns 0, the additive identity element (specialization for double) +template <> CUTLASS_HOST_DEVICE double zero() { + uint64_t bits = 0x0ull; + return reinterpret_cast(bits); +} + +/// Returns 0, the additive identity element (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex zero< complex >() { + return complex(zero(), double()); +} + +/// Returns 2 (specialization for double) +template <> CUTLASS_HOST_DEVICE double two() { + uint64_t bits = 0x4000000000000000ull; + return reinterpret_cast(bits); +} + +/// Returns 2 (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex two< complex >() { + return 
complex(two(), double()); +} + +/// Returns pi, approximately 3.141 (specialization for double) +template <> CUTLASS_HOST_DEVICE double pi() { + uint64_t bits = 0x400921fb54442d18ull; + return reinterpret_cast(bits); +} + +/// Returns pi, approximately 3.141 (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex pi< complex >() { + return complex(pi(), double()); +} + +/// Returns 2 * pi (specialization for double) +template <> CUTLASS_HOST_DEVICE double two_pi() { + uint64_t bits = 0x401921fb54442d18ull; + return reinterpret_cast(bits); +} + +/// Returns 2 * pi (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex two_pi< complex >() { + return complex(two_pi(), double()); +} + +/// Returns pi / 2 (specialization for double) +template <> CUTLASS_HOST_DEVICE double half_pi() { + uint64_t bits = 0x3ff921fb54442d18ull; + return reinterpret_cast(bits); +} + +/// Returns pi / 2 (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex half_pi< complex >() { + return complex(half_pi(), double()); +} + +/// Returns sqrt(pi) (specialization for double) +template <> CUTLASS_HOST_DEVICE double root_pi() { + uint64_t bits = 0x3ffc5bf891b4ef6aull; + return reinterpret_cast(bits); +} + +/// Returns sqrt(pi) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex root_pi< complex >() { + return complex(root_pi(), double()); +} + +/// Returns sqrt(pi / 2) (specialization for double) +template <> CUTLASS_HOST_DEVICE double root_half_pi() { + uint64_t bits = 0x3ff40d931ff62705ull; + return reinterpret_cast(bits); +} + +/// Returns sqrt(pi / 2) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex root_half_pi< complex >() { + return complex(root_half_pi(), double()); +} + +/// Returns sqrt(2 * pi) (specialization for double) +template <> CUTLASS_HOST_DEVICE double root_two_pi() { + uint64_t bits = 0x40040d931ff62705ull; + return reinterpret_cast(bits); +} + +/// Returns sqrt(2 * pi) (specialization for 
complex) +template <> CUTLASS_HOST_DEVICE complex root_two_pi< complex >() { + return complex(root_two_pi(), double()); +} + +/// Returns sqrt(ln(4)) (specialization for double) +template <> CUTLASS_HOST_DEVICE double root_ln_four() { + uint64_t bits = 0x3ff2d6abe44afc43ull; + return reinterpret_cast(bits); +} + +/// Returns sqrt(ln(4)) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex root_ln_four< complex >() { + return complex(root_ln_four(), double()); +} + +/// Returns e, approximately 2.718... (specialization for double) +template <> CUTLASS_HOST_DEVICE double e() { + uint64_t bits = 0x4005bf0a8b145769ull; + return reinterpret_cast(bits); +} + +/// Returns e, approximately 2.718... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex e< complex >() { + return complex(e(), double()); +} + +/// Returns (1/2) (specialization for double) +template <> CUTLASS_HOST_DEVICE double half() { + uint64_t bits = 0x3fe0000000000000ull; + return reinterpret_cast(bits); +} + +/// Returns (1/2) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex half< complex >() { + return complex(half(), double()); +} + +/// Returns sqrt(2), approximately 1.414... (specialization for double) +template <> CUTLASS_HOST_DEVICE double root_two() { + uint64_t bits = 0x3ff6a09e667f3bcdull; + return reinterpret_cast(bits); +} + +/// Returns sqrt(2), approximately 1.414... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex root_two< complex >() { + return complex(root_two(), double()); +} + +/// Returns sqrt(2)/2, approximately 0.707... (specialization for double) +template <> CUTLASS_HOST_DEVICE double half_root_two() { + uint64_t bits = 0x3fe6a09e667f3bcdull; + return reinterpret_cast(bits); +} + +/// Returns sqrt(2)/2, approximately 0.707... 
(specialization for complex) +template <> CUTLASS_HOST_DEVICE complex half_root_two< complex >() { + return complex(half_root_two(), double()); +} + +/// Returns ln(2), approximately 0.693... (specialization for double) +template <> CUTLASS_HOST_DEVICE double ln_two() { + uint64_t bits = 0x3fe62e42fefa39efull; + return reinterpret_cast(bits); +} + +/// Returns ln(2), approximately 0.693... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex ln_two< complex >() { + return complex(ln_two(), double()); +} + +/// Returns ln(ln(2)), approximately -0.3665... (specialization for double) +template <> CUTLASS_HOST_DEVICE double ln_ln_two() { + uint64_t bits = 0xbfd774f29bdd6b9full; + return reinterpret_cast(bits); +} + +/// Returns ln(ln(2)), approximately -0.3665... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex ln_ln_two< complex >() { + return complex(ln_ln_two(), double()); +} + +/// Returns 1/3, approximately 0.333... (specialization for double) +template <> CUTLASS_HOST_DEVICE double third() { + uint64_t bits = 0x3fd5555555555555ull; + return reinterpret_cast(bits); +} + +/// Returns 1/3, approximately 0.333... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex third< complex >() { + return complex(third(), double()); +} + +/// Returns 2/3, approximately 0.666... (specialization for double) +template <> CUTLASS_HOST_DEVICE double twothirds() { + uint64_t bits = 0x3fe5555555555555ull; + return reinterpret_cast(bits); +} + +/// Returns 2/3, approximately 0.666... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex twothirds< complex >() { + return complex(twothirds(), double()); +} + +/// Returns pi - 3, approximately 0.1416... (specialization for double) +template <> CUTLASS_HOST_DEVICE double pi_minus_three() { + uint64_t bits = 0x3fc21fb54442d180ull; + return reinterpret_cast(bits); +} + +/// Returns pi - 3, approximately 0.1416... 
(specialization for complex) +template <> CUTLASS_HOST_DEVICE complex pi_minus_three< complex >() { + return complex(pi_minus_three(), double()); +} + +/// Returns 4 - pi, approximately 0.858... (specialization for double) +template <> CUTLASS_HOST_DEVICE double four_minus_pi() { + uint64_t bits = 0x3feb7812aeef4ba0ull; + return reinterpret_cast(bits); +} + +/// Returns 4 - pi, approximately 0.858... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex four_minus_pi< complex >() { + return complex(four_minus_pi(), double()); +} + +///////////////////////////////////////////////////////////////////////////////////// + +// Specialization for float + +/// Returns 1, the multiplicative identity element (specialization for float) +template <> CUTLASS_HOST_DEVICE float one() { + uint32_t bits = 0x3f800000u; + return reinterpret_cast(bits); +} + +/// Returns 1, the multiplicative identity element (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex one< complex >() { + return complex(one(), float()); +} + +/// Returns 0, the additive identity element (specialization for float) +template <> CUTLASS_HOST_DEVICE float zero() { + uint32_t bits = 0x0u; + return reinterpret_cast(bits); +} + +/// Returns 0, the additive identity element (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex zero< complex >() { + return complex(zero(), float()); +} + +/// Returns 2 (specialization for float) +template <> CUTLASS_HOST_DEVICE float two() { + uint32_t bits = 0x40000000u; + return reinterpret_cast(bits); +} + +/// Returns 2 (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex two< complex >() { + return complex(two(), float()); +} + +/// Returns pi, approximately 3.141 (specialization for float) +template <> CUTLASS_HOST_DEVICE float pi() { + uint32_t bits = 0x40490fdbu; + return reinterpret_cast(bits); +} + +/// Returns pi, approximately 3.141 (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex pi< 
complex >() { + return complex(pi(), float()); +} + +/// Returns 2 * pi (specialization for float) +template <> CUTLASS_HOST_DEVICE float two_pi() { + uint32_t bits = 0x40c90fdbu; + return reinterpret_cast(bits); +} + +/// Returns 2 * pi (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex two_pi< complex >() { + return complex(two_pi(), float()); +} + +/// Returns pi / 2 (specialization for float) +template <> CUTLASS_HOST_DEVICE float half_pi() { + uint32_t bits = 0x3fc90fdbu; + return reinterpret_cast(bits); +} + +/// Returns pi / 2 (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex half_pi< complex >() { + return complex(half_pi(), float()); +} + +/// Returns sqrt(pi) (specialization for float) +template <> CUTLASS_HOST_DEVICE float root_pi() { + uint32_t bits = 0x3fe2dfc5u; + return reinterpret_cast(bits); +} + +/// Returns sqrt(pi) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex root_pi< complex >() { + return complex(root_pi(), float()); +} + +/// Returns sqrt(pi / 2) (specialization for float) +template <> CUTLASS_HOST_DEVICE float root_half_pi() { + uint32_t bits = 0x3fa06c99u; + return reinterpret_cast(bits); +} + +/// Returns sqrt(pi / 2) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex root_half_pi< complex >() { + return complex(root_half_pi(), float()); +} + +/// Returns sqrt(2 * pi) (specialization for float) +template <> CUTLASS_HOST_DEVICE float root_two_pi() { + uint32_t bits = 0x40206c99u; + return reinterpret_cast(bits); +} + +/// Returns sqrt(2 * pi) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex root_two_pi< complex >() { + return complex(root_two_pi(), float()); +} + +/// Returns sqrt(ln(4)) (specialization for float) +template <> CUTLASS_HOST_DEVICE float root_ln_four() { + uint32_t bits = 0x3f96b55fu; + return reinterpret_cast(bits); +} + +/// Returns sqrt(ln(4)) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex 
root_ln_four< complex >() { + return complex(root_ln_four(), float()); +} + +/// Returns e, approximately 2.718... (specialization for float) +template <> CUTLASS_HOST_DEVICE float e() { + uint32_t bits = 0x402df854u; + return reinterpret_cast(bits); +} + +/// Returns e, approximately 2.718... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex e< complex >() { + return complex(e(), float()); +} + +/// Returns (1/2) (specialization for float) +template <> CUTLASS_HOST_DEVICE float half() { + uint32_t bits = 0x3f000000u; + return reinterpret_cast(bits); +} + +/// Returns (1/2) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex half< complex >() { + return complex(half(), float()); +} + +/// Returns sqrt(2), approximately 1.414... (specialization for float) +template <> CUTLASS_HOST_DEVICE float root_two() { + uint32_t bits = 0x3fb504f3u; + return reinterpret_cast(bits); +} + +/// Returns sqrt(2), approximately 1.414... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex root_two< complex >() { + return complex(root_two(), float()); +} + +/// Returns sqrt(2)/2, approximately 0.707... (specialization for float) +template <> CUTLASS_HOST_DEVICE float half_root_two() { + uint32_t bits = 0x3f3504f3u; + return reinterpret_cast(bits); +} + +/// Returns sqrt(2)/2, approximately 0.707... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex half_root_two< complex >() { + return complex(half_root_two(), float()); +} + +/// Returns ln(2), approximately 0.693... (specialization for float) +template <> CUTLASS_HOST_DEVICE float ln_two() { + uint32_t bits = 0x3f317218u; + return reinterpret_cast(bits); +} + +/// Returns ln(2), approximately 0.693... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex ln_two< complex >() { + return complex(ln_two(), float()); +} + +/// Returns ln(ln(2)), approximately -0.3665... 
(specialization for float) +template <> CUTLASS_HOST_DEVICE float ln_ln_two() { + uint32_t bits = 0xbebba795u; + return reinterpret_cast(bits); +} + +/// Returns ln(ln(2)), approximately -0.3665... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex ln_ln_two< complex >() { + return complex(ln_ln_two(), float()); +} + +/// Returns 1/3, approximately 0.333... (specialization for float) +template <> CUTLASS_HOST_DEVICE float third() { + uint32_t bits = 0x3eaaaaabu; + return reinterpret_cast(bits); +} + +/// Returns 1/3, approximately 0.333... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex third< complex >() { + return complex(third(), float()); +} + +/// Returns 2/3, approximately 0.666... (specialization for float) +template <> CUTLASS_HOST_DEVICE float twothirds() { + uint32_t bits = 0x3f2aaaabu; + return reinterpret_cast(bits); +} + +/// Returns 2/3, approximately 0.666... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex twothirds< complex >() { + return complex(twothirds(), float()); +} + +/// Returns pi - 3, approximately 0.1416... (specialization for float) +template <> CUTLASS_HOST_DEVICE float pi_minus_three() { + uint32_t bits = 0x3e10fdaau; + return reinterpret_cast(bits); +} + +/// Returns pi - 3, approximately 0.1416... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex pi_minus_three< complex >() { + return complex(pi_minus_three(), float()); +} + +/// Returns 4 - pi, approximately 0.858... (specialization for float) +template <> CUTLASS_HOST_DEVICE float four_minus_pi() { + uint32_t bits = 0x3f5bc095u; + return reinterpret_cast(bits); +} + +/// Returns 4 - pi, approximately 0.858... 
(specialization for complex) +template <> CUTLASS_HOST_DEVICE complex four_minus_pi< complex >() { + return complex(four_minus_pi(), float()); +} + +///////////////////////////////////////////////////////////////////////////////////// + +// Specialization for tfloat32_t + +/// Returns 1, the multiplicative identity element (specialization for tfloat32_t) +template <> CUTLASS_HOST_DEVICE tfloat32_t one() { + uint32_t bits = 0x3f801000u; + return reinterpret_cast(bits); +} + +/// Returns 1, the multiplicative identity element (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex one< complex >() { + return complex(one(), tfloat32_t()); +} + +/// Returns 0, the additive identity element (specialization for tfloat32_t) +template <> CUTLASS_HOST_DEVICE tfloat32_t zero() { + uint32_t bits = 0x1000u; + return reinterpret_cast(bits); +} + +/// Returns 0, the additive identity element (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex zero< complex >() { + return complex(zero(), tfloat32_t()); +} + +/// Returns 2 (specialization for tfloat32_t) +template <> CUTLASS_HOST_DEVICE tfloat32_t two() { + uint32_t bits = 0x40001000u; + return reinterpret_cast(bits); +} + +/// Returns 2 (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex two< complex >() { + return complex(two(), tfloat32_t()); +} + +/// Returns pi, approximately 3.141 (specialization for tfloat32_t) +template <> CUTLASS_HOST_DEVICE tfloat32_t pi() { + uint32_t bits = 0x40491fdbu; + return reinterpret_cast(bits); +} + +/// Returns pi, approximately 3.141 (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex pi< complex >() { + return complex(pi(), tfloat32_t()); +} + +/// Returns 2 * pi (specialization for tfloat32_t) +template <> CUTLASS_HOST_DEVICE tfloat32_t two_pi() { + uint32_t bits = 0x40c91fdbu; + return reinterpret_cast(bits); +} + +/// Returns 2 * pi (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex two_pi< complex >() 
{ + return complex(two_pi(), tfloat32_t()); +} + +/// Returns pi / 2 (specialization for tfloat32_t) +template <> CUTLASS_HOST_DEVICE tfloat32_t half_pi() { + uint32_t bits = 0x3fc91fdbu; + return reinterpret_cast(bits); +} + +/// Returns pi / 2 (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex half_pi< complex >() { + return complex(half_pi(), tfloat32_t()); +} + +/// Returns sqrt(pi) (specialization for tfloat32_t) +template <> CUTLASS_HOST_DEVICE tfloat32_t root_pi() { + uint32_t bits = 0x3fe2efc5u; + return reinterpret_cast(bits); +} + +/// Returns sqrt(pi) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex root_pi< complex >() { + return complex(root_pi(), tfloat32_t()); +} + +/// Returns sqrt(pi / 2) (specialization for tfloat32_t) +template <> CUTLASS_HOST_DEVICE tfloat32_t root_half_pi() { + uint32_t bits = 0x3fa07c99u; + return reinterpret_cast(bits); +} + +/// Returns sqrt(pi / 2) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex root_half_pi< complex >() { + return complex(root_half_pi(), tfloat32_t()); +} + +/// Returns sqrt(2 * pi) (specialization for tfloat32_t) +template <> CUTLASS_HOST_DEVICE tfloat32_t root_two_pi() { + uint32_t bits = 0x40207c99u; + return reinterpret_cast(bits); +} + +/// Returns sqrt(2 * pi) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex root_two_pi< complex >() { + return complex(root_two_pi(), tfloat32_t()); +} + +/// Returns sqrt(ln(4)) (specialization for tfloat32_t) +template <> CUTLASS_HOST_DEVICE tfloat32_t root_ln_four() { + uint32_t bits = 0x3f96c55fu; + return reinterpret_cast(bits); +} + +/// Returns sqrt(ln(4)) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex root_ln_four< complex >() { + return complex(root_ln_four(), tfloat32_t()); +} + +/// Returns e, approximately 2.718... 
(specialization for tfloat32_t) +template <> CUTLASS_HOST_DEVICE tfloat32_t e() { + uint32_t bits = 0x402e0854u; + return reinterpret_cast(bits); +} + +/// Returns e, approximately 2.718... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex e< complex >() { + return complex(e(), tfloat32_t()); +} + +/// Returns (1/2) (specialization for tfloat32_t) +template <> CUTLASS_HOST_DEVICE tfloat32_t half() { + uint32_t bits = 0x3f001000u; + return reinterpret_cast(bits); +} + +/// Returns (1/2) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex half< complex >() { + return complex(half(), tfloat32_t()); +} + +/// Returns sqrt(2), approximately 1.414... (specialization for tfloat32_t) +template <> CUTLASS_HOST_DEVICE tfloat32_t root_two() { + uint32_t bits = 0x3fb514f3u; + return reinterpret_cast(bits); +} + +/// Returns sqrt(2), approximately 1.414... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex root_two< complex >() { + return complex(root_two(), tfloat32_t()); +} + +/// Returns sqrt(2)/2, approximately 0.707... (specialization for tfloat32_t) +template <> CUTLASS_HOST_DEVICE tfloat32_t half_root_two() { + uint32_t bits = 0x3f3514f3u; + return reinterpret_cast(bits); +} + +/// Returns sqrt(2)/2, approximately 0.707... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex half_root_two< complex >() { + return complex(half_root_two(), tfloat32_t()); +} + +/// Returns ln(2), approximately 0.693... (specialization for tfloat32_t) +template <> CUTLASS_HOST_DEVICE tfloat32_t ln_two() { + uint32_t bits = 0x3f318218u; + return reinterpret_cast(bits); +} + +/// Returns ln(2), approximately 0.693... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex ln_two< complex >() { + return complex(ln_two(), tfloat32_t()); +} + +/// Returns ln(ln(2)), approximately -0.3665... 
(specialization for tfloat32_t) +template <> CUTLASS_HOST_DEVICE tfloat32_t ln_ln_two() { + uint32_t bits = 0xbebbb795u; + return reinterpret_cast(bits); +} + +/// Returns ln(ln(2)), approximately -0.3665... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex ln_ln_two< complex >() { + return complex(ln_ln_two(), tfloat32_t()); +} + +/// Returns 1/3, approximately 0.333... (specialization for tfloat32_t) +template <> CUTLASS_HOST_DEVICE tfloat32_t third() { + uint32_t bits = 0x3eaabaabu; + return reinterpret_cast(bits); +} + +/// Returns 1/3, approximately 0.333... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex third< complex >() { + return complex(third(), tfloat32_t()); +} + +/// Returns 2/3, approximately 0.666... (specialization for tfloat32_t) +template <> CUTLASS_HOST_DEVICE tfloat32_t twothirds() { + uint32_t bits = 0x3f2abaabu; + return reinterpret_cast(bits); +} + +/// Returns 2/3, approximately 0.666... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex twothirds< complex >() { + return complex(twothirds(), tfloat32_t()); +} + +/// Returns pi - 3, approximately 0.1416... (specialization for tfloat32_t) +template <> CUTLASS_HOST_DEVICE tfloat32_t pi_minus_three() { + uint32_t bits = 0x3e110daau; + return reinterpret_cast(bits); +} + +/// Returns pi - 3, approximately 0.1416... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex pi_minus_three< complex >() { + return complex(pi_minus_three(), tfloat32_t()); +} + +/// Returns 4 - pi, approximately 0.858... (specialization for tfloat32_t) +template <> CUTLASS_HOST_DEVICE tfloat32_t four_minus_pi() { + uint32_t bits = 0x3f5bd095u; + return reinterpret_cast(bits); +} + +/// Returns 4 - pi, approximately 0.858... 
(specialization for complex) +template <> CUTLASS_HOST_DEVICE complex four_minus_pi< complex >() { + return complex(four_minus_pi(), tfloat32_t()); +} + +///////////////////////////////////////////////////////////////////////////////////// + +// Specialization for half_t + +/// Returns 1, the multiplicative identity element (specialization for half_t) +template <> CUTLASS_HOST_DEVICE half_t one() { + uint16_t bits = 0x3c00u; + return reinterpret_cast(bits); +} + +/// Returns 1, the multiplicative identity element (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex one< complex >() { + return complex(one(), half_t()); +} + +/// Returns 0, the additive identity element (specialization for half_t) +template <> CUTLASS_HOST_DEVICE half_t zero() { + uint16_t bits = 0x0u; + return reinterpret_cast(bits); +} + +/// Returns 0, the additive identity element (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex zero< complex >() { + return complex(zero(), half_t()); +} + +/// Returns 2 (specialization for half_t) +template <> CUTLASS_HOST_DEVICE half_t two() { + uint16_t bits = 0x4000u; + return reinterpret_cast(bits); +} + +/// Returns 2 (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex two< complex >() { + return complex(two(), half_t()); +} + +/// Returns pi, approximately 3.141 (specialization for half_t) +template <> CUTLASS_HOST_DEVICE half_t pi() { + uint16_t bits = 0x4248u; + return reinterpret_cast(bits); +} + +/// Returns pi, approximately 3.141 (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex pi< complex >() { + return complex(pi(), half_t()); +} + +/// Returns 2 * pi (specialization for half_t) +template <> CUTLASS_HOST_DEVICE half_t two_pi() { + uint16_t bits = 0x4648u; + return reinterpret_cast(bits); +} + +/// Returns 2 * pi (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex two_pi< complex >() { + return complex(two_pi(), half_t()); +} + +/// Returns pi / 2 
(specialization for half_t) +template <> CUTLASS_HOST_DEVICE half_t half_pi() { + uint16_t bits = 0x3e48u; + return reinterpret_cast(bits); +} + +/// Returns pi / 2 (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex half_pi< complex >() { + return complex(half_pi(), half_t()); +} + +/// Returns sqrt(pi) (specialization for half_t) +template <> CUTLASS_HOST_DEVICE half_t root_pi() { + uint16_t bits = 0x3f17u; + return reinterpret_cast(bits); +} + +/// Returns sqrt(pi) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex root_pi< complex >() { + return complex(root_pi(), half_t()); +} + +/// Returns sqrt(pi / 2) (specialization for half_t) +template <> CUTLASS_HOST_DEVICE half_t root_half_pi() { + uint16_t bits = 0x3d03u; + return reinterpret_cast(bits); +} + +/// Returns sqrt(pi / 2) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex root_half_pi< complex >() { + return complex(root_half_pi(), half_t()); +} + +/// Returns sqrt(2 * pi) (specialization for half_t) +template <> CUTLASS_HOST_DEVICE half_t root_two_pi() { + uint16_t bits = 0x4103u; + return reinterpret_cast(bits); +} + +/// Returns sqrt(2 * pi) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex root_two_pi< complex >() { + return complex(root_two_pi(), half_t()); +} + +/// Returns sqrt(ln(4)) (specialization for half_t) +template <> CUTLASS_HOST_DEVICE half_t root_ln_four() { + uint16_t bits = 0x3cb6u; + return reinterpret_cast(bits); +} + +/// Returns sqrt(ln(4)) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex root_ln_four< complex >() { + return complex(root_ln_four(), half_t()); +} + +/// Returns e, approximately 2.718... (specialization for half_t) +template <> CUTLASS_HOST_DEVICE half_t e() { + uint16_t bits = 0x4170u; + return reinterpret_cast(bits); +} + +/// Returns e, approximately 2.718... 
(specialization for complex) +template <> CUTLASS_HOST_DEVICE complex e< complex >() { + return complex(e(), half_t()); +} + +/// Returns (1/2) (specialization for half_t) +template <> CUTLASS_HOST_DEVICE half_t half() { + uint16_t bits = 0x3800u; + return reinterpret_cast(bits); +} + +/// Returns (1/2) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex half< complex >() { + return complex(half(), half_t()); +} + +/// Returns sqrt(2), approximately 1.414... (specialization for half_t) +template <> CUTLASS_HOST_DEVICE half_t root_two() { + uint16_t bits = 0x3da8u; + return reinterpret_cast(bits); +} + +/// Returns sqrt(2), approximately 1.414... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex root_two< complex >() { + return complex(root_two(), half_t()); +} + +/// Returns sqrt(2)/2, approximately 0.707... (specialization for half_t) +template <> CUTLASS_HOST_DEVICE half_t half_root_two() { + uint16_t bits = 0x39a8u; + return reinterpret_cast(bits); +} + +/// Returns sqrt(2)/2, approximately 0.707... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex half_root_two< complex >() { + return complex(half_root_two(), half_t()); +} + +/// Returns ln(2), approximately 0.693... (specialization for half_t) +template <> CUTLASS_HOST_DEVICE half_t ln_two() { + uint16_t bits = 0x398cu; + return reinterpret_cast(bits); +} + +/// Returns ln(2), approximately 0.693... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex ln_two< complex >() { + return complex(ln_two(), half_t()); +} + +/// Returns ln(ln(2)), approximately -0.3665... (specialization for half_t) +template <> CUTLASS_HOST_DEVICE half_t ln_ln_two() { + uint16_t bits = 0xb5ddu; + return reinterpret_cast(bits); +} + +/// Returns ln(ln(2)), approximately -0.3665... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex ln_ln_two< complex >() { + return complex(ln_ln_two(), half_t()); +} + +/// Returns 1/3, approximately 0.333... 
(specialization for half_t) +template <> CUTLASS_HOST_DEVICE half_t third() { + uint16_t bits = 0x3555u; + return reinterpret_cast(bits); +} + +/// Returns 1/3, approximately 0.333... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex third< complex >() { + return complex(third(), half_t()); +} + +/// Returns 2/3, approximately 0.666... (specialization for half_t) +template <> CUTLASS_HOST_DEVICE half_t twothirds() { + uint16_t bits = 0x3955u; + return reinterpret_cast(bits); +} + +/// Returns 2/3, approximately 0.666... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex twothirds< complex >() { + return complex(twothirds(), half_t()); +} + +/// Returns pi - 3, approximately 0.1416... (specialization for half_t) +template <> CUTLASS_HOST_DEVICE half_t pi_minus_three() { + uint16_t bits = 0x3088u; + return reinterpret_cast(bits); +} + +/// Returns pi - 3, approximately 0.1416... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex pi_minus_three< complex >() { + return complex(pi_minus_three(), half_t()); +} + +/// Returns 4 - pi, approximately 0.858... (specialization for half_t) +template <> CUTLASS_HOST_DEVICE half_t four_minus_pi() { + uint16_t bits = 0x3adeu; + return reinterpret_cast(bits); +} + +/// Returns 4 - pi, approximately 0.858... 
(specialization for complex) +template <> CUTLASS_HOST_DEVICE complex four_minus_pi< complex >() { + return complex(four_minus_pi(), half_t()); +} + +///////////////////////////////////////////////////////////////////////////////////// + +// Specialization for bfloat16_t + +/// Returns 1, the multiplicative identity element (specialization for bfloat16_t) +template <> CUTLASS_HOST_DEVICE bfloat16_t one() { + uint16_t bits = 0x3f80u; + return reinterpret_cast(bits); +} + +/// Returns 1, the multiplicative identity element (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex one< complex >() { + return complex(one(), bfloat16_t()); +} + +/// Returns 0, the additive identity element (specialization for bfloat16_t) +template <> CUTLASS_HOST_DEVICE bfloat16_t zero() { + uint16_t bits = 0x0u; + return reinterpret_cast(bits); +} + +/// Returns 0, the additive identity element (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex zero< complex >() { + return complex(zero(), bfloat16_t()); +} + +/// Returns 2 (specialization for bfloat16_t) +template <> CUTLASS_HOST_DEVICE bfloat16_t two() { + uint16_t bits = 0x4000u; + return reinterpret_cast(bits); +} + +/// Returns 2 (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex two< complex >() { + return complex(two(), bfloat16_t()); +} + +/// Returns pi, approximately 3.141 (specialization for bfloat16_t) +template <> CUTLASS_HOST_DEVICE bfloat16_t pi() { + uint16_t bits = 0x4049u; + return reinterpret_cast(bits); +} + +/// Returns pi, approximately 3.141 (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex pi< complex >() { + return complex(pi(), bfloat16_t()); +} + +/// Returns 2 * pi (specialization for bfloat16_t) +template <> CUTLASS_HOST_DEVICE bfloat16_t two_pi() { + uint16_t bits = 0x40c9u; + return reinterpret_cast(bits); +} + +/// Returns 2 * pi (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex two_pi< complex >() { + return 
complex(two_pi(), bfloat16_t()); +} + +/// Returns pi / 2 (specialization for bfloat16_t) +template <> CUTLASS_HOST_DEVICE bfloat16_t half_pi() { + uint16_t bits = 0x3fc9u; + return reinterpret_cast(bits); +} + +/// Returns pi / 2 (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex half_pi< complex >() { + return complex(half_pi(), bfloat16_t()); +} + +/// Returns sqrt(pi) (specialization for bfloat16_t) +template <> CUTLASS_HOST_DEVICE bfloat16_t root_pi() { + uint16_t bits = 0x3fe3u; + return reinterpret_cast(bits); +} + +/// Returns sqrt(pi) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex root_pi< complex >() { + return complex(root_pi(), bfloat16_t()); +} + +/// Returns sqrt(pi / 2) (specialization for bfloat16_t) +template <> CUTLASS_HOST_DEVICE bfloat16_t root_half_pi() { + uint16_t bits = 0x3fa0u; + return reinterpret_cast(bits); +} + +/// Returns sqrt(pi / 2) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex root_half_pi< complex >() { + return complex(root_half_pi(), bfloat16_t()); +} + +/// Returns sqrt(2 * pi) (specialization for bfloat16_t) +template <> CUTLASS_HOST_DEVICE bfloat16_t root_two_pi() { + uint16_t bits = 0x4020u; + return reinterpret_cast(bits); +} + +/// Returns sqrt(2 * pi) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex root_two_pi< complex >() { + return complex(root_two_pi(), bfloat16_t()); +} + +/// Returns sqrt(ln(4)) (specialization for bfloat16_t) +template <> CUTLASS_HOST_DEVICE bfloat16_t root_ln_four() { + uint16_t bits = 0x3f97u; + return reinterpret_cast(bits); +} + +/// Returns sqrt(ln(4)) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex root_ln_four< complex >() { + return complex(root_ln_four(), bfloat16_t()); +} + +/// Returns e, approximately 2.718... 
(specialization for bfloat16_t) +template <> CUTLASS_HOST_DEVICE bfloat16_t e() { + uint16_t bits = 0x402eu; + return reinterpret_cast(bits); +} + +/// Returns e, approximately 2.718... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex e< complex >() { + return complex(e(), bfloat16_t()); +} + +/// Returns (1/2) (specialization for bfloat16_t) +template <> CUTLASS_HOST_DEVICE bfloat16_t half() { + uint16_t bits = 0x3f00u; + return reinterpret_cast(bits); +} + +/// Returns (1/2) (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex half< complex >() { + return complex(half(), bfloat16_t()); +} + +/// Returns sqrt(2), approximately 1.414... (specialization for bfloat16_t) +template <> CUTLASS_HOST_DEVICE bfloat16_t root_two() { + uint16_t bits = 0x3fb5u; + return reinterpret_cast(bits); +} + +/// Returns sqrt(2), approximately 1.414... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex root_two< complex >() { + return complex(root_two(), bfloat16_t()); +} + +/// Returns sqrt(2)/2, approximately 0.707... (specialization for bfloat16_t) +template <> CUTLASS_HOST_DEVICE bfloat16_t half_root_two() { + uint16_t bits = 0x3f35u; + return reinterpret_cast(bits); +} + +/// Returns sqrt(2)/2, approximately 0.707... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex half_root_two< complex >() { + return complex(half_root_two(), bfloat16_t()); +} + +/// Returns ln(2), approximately 0.693... (specialization for bfloat16_t) +template <> CUTLASS_HOST_DEVICE bfloat16_t ln_two() { + uint16_t bits = 0x3f31u; + return reinterpret_cast(bits); +} + +/// Returns ln(2), approximately 0.693... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex ln_two< complex >() { + return complex(ln_two(), bfloat16_t()); +} + +/// Returns ln(ln(2)), approximately -0.3665... 
(specialization for bfloat16_t) +template <> CUTLASS_HOST_DEVICE bfloat16_t ln_ln_two() { + uint16_t bits = 0xbebcu; + return reinterpret_cast(bits); +} + +/// Returns ln(ln(2)), approximately -0.3665... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex ln_ln_two< complex >() { + return complex(ln_ln_two(), bfloat16_t()); +} + +/// Returns 1/3, approximately 0.333... (specialization for bfloat16_t) +template <> CUTLASS_HOST_DEVICE bfloat16_t third() { + uint16_t bits = 0x3eabu; + return reinterpret_cast(bits); +} + +/// Returns 1/3, approximately 0.333... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex third< complex >() { + return complex(third(), bfloat16_t()); +} + +/// Returns 2/3, approximately 0.666... (specialization for bfloat16_t) +template <> CUTLASS_HOST_DEVICE bfloat16_t twothirds() { + uint16_t bits = 0x3f2bu; + return reinterpret_cast(bits); +} + +/// Returns 2/3, approximately 0.666... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex twothirds< complex >() { + return complex(twothirds(), bfloat16_t()); +} + +/// Returns pi - 3, approximately 0.1416... (specialization for bfloat16_t) +template <> CUTLASS_HOST_DEVICE bfloat16_t pi_minus_three() { + uint16_t bits = 0x3e11u; + return reinterpret_cast(bits); +} + +/// Returns pi - 3, approximately 0.1416... (specialization for complex) +template <> CUTLASS_HOST_DEVICE complex pi_minus_three< complex >() { + return complex(pi_minus_three(), bfloat16_t()); +} + +/// Returns 4 - pi, approximately 0.858... (specialization for bfloat16_t) +template <> CUTLASS_HOST_DEVICE bfloat16_t four_minus_pi() { + uint16_t bits = 0x3f5cu; + return reinterpret_cast(bits); +} + +/// Returns 4 - pi, approximately 0.858... 
(specialization for complex) +template <> CUTLASS_HOST_DEVICE complex four_minus_pi< complex >() { + return complex(four_minus_pi(), bfloat16_t()); +} +/////////////////////////////////////////////////////////////////////////////////// + +} // namespace constants +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/collective_builder.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/collective_builder.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e032f9599a5e76eb1e8dd6b5279ae9a42ce9c9b4 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/collective_builder.hpp @@ -0,0 +1,94 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cutlass/detail/dependent_false.hpp" +#include "cutlass/conv/collective/collective_conv.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv::collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Used to specify stage counts or dispatch to automatic computation of stage count +template +struct StageCount { + static constexpr int value = num_stages; + + StageCount() = default; + explicit StageCount(cute::Int) {} +}; + +template +struct StageCountAutoCarveout { + static constexpr int bytes = carveout_bytes; + + StageCountAutoCarveout() = default; + explicit StageCountAutoCarveout(cute::Int) {} +}; + +// Used to automatically let the builder pick the kernel schedule. 
+// Can be overridden with kernel schedule tags in cutlass/conv/dispatch_policy.hpp +struct KernelScheduleAuto {}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class ArchTag, + class OpClass, + conv::Operator, + class ElementA, + class GmemLayoutA, + int AlignmentA, + class ElementB, + class GmemLayoutB, + int AlignmentB, + class ElementAccumulator, + class TileShape_MNK, + class ClusterShape_MNK, + class StageCountType, + class KernelScheduleType, + class Enable = void +> +struct CollectiveBuilder { + static_assert(cutlass::detail::dependent_false, "Could not build a collective for given parameters."); +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::conv::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "builders/sm90_gmma_builder.inl" +#include "builders/sm100_umma_builder.inl" +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/collective_conv.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/collective_conv.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f0bb596fe02b36d350a1d1065ff5001794eba170 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/collective_conv.hpp @@ -0,0 +1,63 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include "cutlass/detail/dependent_false.hpp" +#include "cutlass/conv/collective/detail.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv::collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class DispatchPolicy, + class TileShape, + class ElementA, + class ElementB, + class TiledMma, + class TileTraitsA, + class TileTraitsB +> +struct CollectiveConv { + static_assert(cutlass::detail::dependent_false, "Could not find a mainloop specialization."); +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::conv::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "sm90_implicit_gemm_gmma_ss_warpspecialized.hpp" +#include "sm100_implicit_gemm_umma_warpspecialized.hpp" +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/detail.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/detail.hpp new file mode 100644 index 0000000000000000000000000000000000000000..af541a940f787528d213f068915ce0aa5997a82f --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/detail.hpp @@ -0,0 +1,271 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include "cutlass/conv/convnd_problem_shape.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv::collective::detail { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Construct the stride types for conv collectives based on the dispatch policy, strides 64b by default +template +constexpr auto +sm90_dispatch_policy_to_stride_A() { + if constexpr (DispatchPolicy::ConvOp == conv::Operator::kFprop) { + // Maps to modes ((w,n), C) + if constexpr (DispatchPolicy::NumSpatialDimensions == 1) { + return cute::Stride, + cute::Int<1>>{}; + } + // Maps to modes ((w,h,n), C) + else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) { + return cute::Stride, + cute::Int<1>>{}; + } + // Maps to modes ((w,h,d,n), C) + else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) { + return cute::Stride, + cute::Int<1>>{}; + } + // error dims assert + else { + static_assert(cutlass::detail::dependent_false, "Unsupported spatial dim count."); + } + } + else if constexpr (DispatchPolicy::ConvOp == conv::Operator::kWgrad) { + // Maps to modes (k, nq/npq/nzpq) + if constexpr (DispatchPolicy::NumSpatialDimensions == 1 || + DispatchPolicy::NumSpatialDimensions == 2 || + DispatchPolicy::NumSpatialDimensions == 3) { + return cute::Stride, int64_t>{}; + } + // error dims assert + else { + static_assert(cutlass::detail::dependent_false, "Unsupported spatial dim count."); + } + } + else if constexpr (DispatchPolicy::ConvOp == conv::Operator::kDgrad) { + // Maps to modes ((q,n), K) + if constexpr (DispatchPolicy::NumSpatialDimensions == 1) { + return cute::Stride, + cute::Int<1>>{}; + } + // Maps to modes ((q,p,n), K) + else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) { + return cute::Stride, + cute::Int<1>>{}; + } + 
// Maps to modes ((q,p,z,n), K) + else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) { + return cute::Stride, + cute::Int<1>>{}; + } + // error dims assert + else { + static_assert(cutlass::detail::dependent_false, "Unsupported spatial dim count."); + } + } + else { + static_assert(cutlass::detail::dependent_false, "Unsupported ConvOp."); + } +} + +// Construct the stirde types for conv collectives based on the dispatch policy, strides 64b by default +template +constexpr auto +sm90_dispatch_policy_to_stride_B() { + if constexpr (DispatchPolicy::ConvOp == conv::Operator::kFprop) { + // Maps to modes (k, (C,s)) + if constexpr (DispatchPolicy::NumSpatialDimensions == 1) { + return cute::Stride, int64_t>>{}; + } + // Maps to modes (k, (C,s,r)) + else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) { + return cute::Stride, int64_t, int64_t>>{}; + } + // Maps to modes (k, (C,s,r,t)) + else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) { + return cute::Stride, int64_t, int64_t, int64_t>>{}; + } + // error dims assert + else { + static_assert(cutlass::detail::dependent_false, "Unsupported spatial dim count."); + } + } + else if constexpr (DispatchPolicy::ConvOp == conv::Operator::kWgrad) { + // Maps to modes (C, (w,n)) + if constexpr (DispatchPolicy::NumSpatialDimensions == 1) { + return cute::Stride, + cute::Stride>{}; + } + // Maps to modes (C, (w,h,n)) + else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) { + return cute::Stride, + cute::Stride>{}; + } + // Maps to modes (C, (w,h,d,n)) + else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) { + return cute::Stride, + cute::Stride>{}; + } + // error dims assert + else { + static_assert(cutlass::detail::dependent_false, "Unsupported spatial dim count."); + } + } + else if constexpr (DispatchPolicy::ConvOp == conv::Operator::kDgrad) { + // Maps to modes (C, (k,s)) + if constexpr (DispatchPolicy::NumSpatialDimensions == 1) { + return cute::Stride, cute::Stride>{}; + } + // 
Maps to modes (C, (k,s,r)) + else if constexpr (DispatchPolicy::NumSpatialDimensions == 2) { + return cute::Stride, cute::Stride>{}; + } + // Maps to modes (C, (k,s,r,t)) + else if constexpr (DispatchPolicy::NumSpatialDimensions == 3) { + return cute::Stride, cute::Stride>{}; + } + // error dims assert + else { + static_assert(cutlass::detail::dependent_false, "Unsupported spatial dim count."); + } + } + else { + static_assert(cutlass::detail::dependent_false, "Unsupported ConvOp."); + } +} + + +template +constexpr auto +sm100_dispatch_policy_to_stride_A() { + return sm90_dispatch_policy_to_stride_A(); +} + +template +constexpr auto +sm100_dispatch_policy_to_stride_B() { + return sm90_dispatch_policy_to_stride_B(); +} + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Compute the lower/near corner, returning it as a cute::array in [W,H,D] order +template +CUTLASS_HOST_DEVICE +constexpr auto +compute_lower_corner_whd(ConvProblemShape const& problem_shape) { + using cute::for_each; + using cute::make_seq; + + cute::array lower{}; + if constexpr (ConvOp == conv::Operator::kFprop || + ConvOp == conv::Operator::kWgrad) { + for_each(make_seq{}, [&](auto i) { + lower[NumSpatialDimensions-1-i] = -1 * problem_shape.lower_padding[i]; + }); + } + else if constexpr (ConvOp == conv::Operator::kDgrad) { + for_each(make_seq{}, [&](auto i) { + lower[NumSpatialDimensions-1-i] = problem_shape.lower_padding[i] - + (problem_shape.shape_B[i+1] - 1) * problem_shape.dilation[i]; + }); + } + return lower; +} + +// Computes the upper/far corner, returning it as a cute::array in [W,H,D] order +template +CUTLASS_HOST_DEVICE +constexpr auto +compute_upper_corner_whd(ConvProblemShape const& problem_shape) { + using cute::for_each; + using cute::make_seq; + + cute::array upper{}; + if constexpr (ConvOp == conv::Operator::kFprop) { + for_each(make_seq{}, [&](auto i) { + upper[NumSpatialDimensions-1-i] = problem_shape.upper_padding[i] - + 
(problem_shape.shape_B[i+1] - 1) * problem_shape.dilation[i]; + }); + } + else if constexpr (ConvOp == conv::Operator::kWgrad) { + for_each(make_seq{}, [&](auto i) { + upper[NumSpatialDimensions-1-i] = problem_shape.upper_padding[i] - + (problem_shape.shape_C[i+1] - 1) * problem_shape.dilation[i]; + }); + } + else if constexpr (ConvOp == conv::Operator::kDgrad) { + for_each(make_seq{}, [&](auto i) { + upper[NumSpatialDimensions-1-i] = problem_shape.lower_padding[i] - + (problem_shape.shape_B[i+1] - 1) * problem_shape.dilation[i] + problem_shape.shape_C[i+1] - problem_shape.shape_A[i+1]; + }); + } + return upper; +} + +// Compute the lower/near corner of (t,r,s), returning it as a cute::array in [S,R,T] order +template +CUTLASS_HOST_DEVICE +constexpr auto +compute_lower_srt(ConvProblemShape const& problem_shape) { + using cute::for_each; + using cute::make_seq; + + cute::array lower{}; + if constexpr (ConvOp == conv::Operator::kFprop || + ConvOp == conv::Operator::kWgrad) { + for_each(make_seq{}, [&](auto i) { + lower[NumSpatialDimensions-1-i] = 0; + }); + } + else if constexpr (ConvOp == conv::Operator::kDgrad) { + for_each(make_seq{}, [&](auto i) { + lower[NumSpatialDimensions-1-i] = (problem_shape.shape_B[i+1] - 1) * problem_shape.dilation[i]; + }); + } + return lower; +} + +template struct is_im2col_load { static constexpr bool value = false; }; +template <> struct is_im2col_load { static constexpr bool value = true; }; +template <> struct is_im2col_load { static constexpr bool value = true; }; +template <> struct is_im2col_load { static constexpr bool value = true; }; +template <> struct is_im2col_load { static constexpr bool value = true; }; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::conv::collective::detail diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/sm100_implicit_gemm_umma_warpspecialized.hpp 
b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/sm100_implicit_gemm_umma_warpspecialized.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d3c541c325004eb8488ca7353eed9a43fa4ae280 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/sm100_implicit_gemm_umma_warpspecialized.hpp @@ -0,0 +1,917 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/pipeline/pipeline.hpp" +#include "cutlass/gemm/gemm.h" +#include "cutlass/detail/cluster.hpp" + +#include "cutlass/conv/detail.hpp" +#include "cute/algorithm/functional.hpp" +#include "cute/arch/cluster_sm90.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cute/algorithm/gemm.hpp" +#include "cute/numeric/arithmetic_tuple.hpp" +#include "cutlass/trace.h" + +#if (! 
defined(__CUDA_ARCH__)) && (CUTLASS_DEBUG_TRACE_LEVEL > 0) +# include +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv::collective { +using namespace cute; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// WarpSpecialized Mainloop +// Both DMA Load and MMA methods of this class must be run by a single thread that's picked by elect_one +template < + conv::Operator ConvOp, + int Stages, + int NumSpatialDims, + int SchedulerPipelineStageCount, + int AccumulatorPipelineStageCount, + class ClusterShape, // Static cluster shape or dynamic (int, int, _1) + class TileShapeMNKL_, // (MmaAtomShapeM, MmaAtomShapeN, TileK, optional: TileL) + class ElementA_, + class ElementB_, + class TiledMma_, + class TileTraitsA_, + class TileTraitsB_> +struct CollectiveConv< + MainloopSm100TmaUmmaWarpSpecializedImplicitGemm< + ConvOp, + Stages, + NumSpatialDims, + SchedulerPipelineStageCount, + AccumulatorPipelineStageCount, + ClusterShape>, + TileShapeMNKL_, + ElementA_, + ElementB_, + TiledMma_, + TileTraitsA_, + TileTraitsB_> +{ + // + // Type Aliases + // + using DispatchPolicy = MainloopSm100TmaUmmaWarpSpecializedImplicitGemm< + ConvOp, + Stages, + NumSpatialDims, + SchedulerPipelineStageCount, + AccumulatorPipelineStageCount, + ClusterShape>; + using TileShape = decltype(cute::take<0,3>(TileShapeMNKL_{})); // (MmaAtomShapeM, MmaAtomShapeN, TileK) + using ElementA = ElementA_; + using ElementB = ElementB_; + using TiledMma = TiledMma_; + using ElementAccumulator = typename TiledMma::ValTypeC; + using GmemTiledCopyA = typename TileTraitsA_::GmemTiledCopy; + using GmemTiledCopyB = typename TileTraitsB_::GmemTiledCopy; + using SmemLayoutAtomA = typename TileTraitsA_::SmemLayoutAtom; + using SmemLayoutAtomB = typename TileTraitsB_::SmemLayoutAtom; + using ArchTag = typename DispatchPolicy::ArchTag; + static constexpr int NumSpatialDimensions = 
DispatchPolicy::NumSpatialDimensions; + static constexpr int NumTensorDimensions = NumSpatialDimensions + 2; + // deducde the kernel facing stride tuple types based on the dispatch policy (spatial dim, algo, etc.) + using StrideA = decltype(detail::sm100_dispatch_policy_to_stride_A()); + using StrideB = decltype(detail::sm100_dispatch_policy_to_stride_B()); + + static constexpr bool IsDynamicCluster = not cute::is_static_v; + static constexpr bool ConvertF32toTF32A = cute::is_same_v; + static constexpr bool ConvertF32toTF32B = cute::is_same_v; + using TmaInternalElementA = cute::conditional_t>>; + using TmaInternalElementB = cute::conditional_t>>; + + using ElementAMma = cute::conditional_t, tfloat32_t, ElementA>; + using ElementBMma = cute::conditional_t, tfloat32_t, ElementB>; + + // Determine MMA type: MMA_1SM vs MMA_2SM + using AtomThrShapeMNK = Shape(typename TiledMma_::ThrLayoutVMNK{})), _1, _1>; + + using MainloopPipeline = cutlass::PipelineTmaUmmaAsync< + DispatchPolicy::Stages, + ClusterShape, + AtomThrShapeMNK>; + using MainloopPipelineState = typename MainloopPipeline::PipelineState; + + using ProblemShape = ConvProblemShape; + + CUTE_STATIC_ASSERT_V(evenly_divides(shape<0>(TileShape{}), tile_size<0>(TiledMma{})), "TileShape_M should be evenly divided by TiledMma_M"); + CUTE_STATIC_ASSERT_V(evenly_divides(shape<1>(TileShape{}), tile_size<1>(TiledMma{})) || (ConvOp == conv::Operator::kWgrad), "TileShape_N should be evenly divided by TiledMma_N"); + + using CtaShape_MNK = decltype(shape_div(TileShape{}, AtomThrShapeMNK{})); + + // Define A and B block shapes for reduced size TMA_LOADs + using MmaShapeA_MK = decltype(partition_shape_A(TiledMma{}, make_shape(size<0>(TileShape{}), size<2>(TileShape{})))); + using MmaShapeB_NK = decltype(partition_shape_B(TiledMma{}, make_shape(size<1>(TileShape{}), size<2>(TileShape{})))); + + static_assert(rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert(((size<0,0>(MmaShapeA_MK{}) * 
size<1>(MmaShapeA_MK{})) % size<0>(SmemLayoutAtomA{})) == 0, + "SmemLayoutAtom must evenly divide tile shape."); + static_assert(((size<0,1>(MmaShapeA_MK{}) * size<2>(MmaShapeA_MK{})) % size<1>(SmemLayoutAtomA{})) == 0, + "SmemLayoutAtom must evenly divide tile shape."); + + static_assert(rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert(((size<0,0>(MmaShapeB_NK{}) * size<1>(MmaShapeB_NK{})) % size<0>(SmemLayoutAtomB{})) == 0, + "SmemLayoutAtom must evenly divide tile shape."); + static_assert(((size<0,1>(MmaShapeB_NK{}) * size<2>(MmaShapeB_NK{})) % size<1>(SmemLayoutAtomB{})) == 0, + "SmemLayoutAtom must evenly divide tile shape."); + + // Tile along K mode first before tiling over MN. PIPE mode last as usual. + // This maximizes TMA boxes due to better smem-K vectorization, reducing total issued TMAs. + using SmemLayoutA = decltype(UMMA::tile_to_mma_shape( + SmemLayoutAtomA{}, + append(MmaShapeA_MK{}, Int{}), + Step<_2,_1,_3>{})); + using SmemLayoutB = decltype(UMMA::tile_to_mma_shape( + SmemLayoutAtomB{}, + append(MmaShapeB_NK{}, Int{}), + Step<_2,_1,_3>{})); + + static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more."); + static_assert(cute::is_base_of::value && + cute::is_base_of::value, + "MMA atom must source both A and B operand from smem_desc for this mainloop."); + + static constexpr bool is_im2col_A = detail::is_im2col_load::value; + static constexpr bool is_im2col_B = detail::is_im2col_load::value; + static constexpr bool is_strided_dgrad = ConvOp == conv::Operator::kDgrad && not is_im2col_A && not is_im2col_B; + + static constexpr int TileShapeMNKLRank = rank(TileShapeMNKL_{}); + // If rank > 3, TileL exists and it is GroupsPerTile. The kernel is grouped conv now. 
+ static constexpr bool is_grouped_wgrad = ConvOp == conv::Operator::kWgrad && TileShapeMNKLRank > 3; + + struct SharedStorage { + struct TensorStorage : cute::aligned_struct<128, _0> { + cute::array_aligned> smem_A; + cute::array_aligned> smem_B; + } tensors; + + using PipelineStorage = typename MainloopPipeline::SharedStorage; + PipelineStorage pipeline; + }; + + using TensorStorage = typename SharedStorage::TensorStorage; + using PipelineStorage = typename SharedStorage::PipelineStorage; + + // Only one thread issues the TMA and updates the barriers in a 2SM MMA, adjust bytes accordingly + static constexpr uint32_t TmaTransactionBytes = + size(AtomThrShapeMNK{}) * (size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * size<2>(SmemLayoutA{}) * static_cast(sizeof(ElementA))) + + size(AtomThrShapeMNK{}) * (size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * size<2>(SmemLayoutB{}) * static_cast(sizeof(ElementB))); + + // Host side kernel arguments + struct Arguments { + ElementA const* ptr_A{nullptr}; + ElementB const* ptr_B{nullptr}; + }; + +private: + + // Note that for fprop and non-strided dgrad kernel, the tma load mode is im2col for tensor A and tiled for + // tensor B while for wgrad kernel, the tma load mode is tiled for tensor A and im2col for tensor + // B since operand A, B is swapped. 
+ // For strided dgrad A and B are both tma tiled and not im2col + + template + static constexpr auto + get_tma_load_a_instance( + TensorA const& tensor_a, + ProblemShape const& problem_shape, + ClusterShapeVMNK const& cluster_shape_vmnk) { + + if constexpr (is_im2col_A) { + // compute the upper and lower corners based on the conv padding + auto lower_corner_whd = detail::compute_lower_corner_whd(problem_shape); + auto upper_corner_whd = detail::compute_upper_corner_whd(problem_shape); + auto lower_srt = detail::compute_lower_srt(problem_shape); + + // gbasis strides for dgrad kernel need to be negated + cute::array stride_srt{}; + for (int i = 0; i < NumSpatialDimensions; ++i) { + stride_srt[i] = ConvOp == conv::Operator::kDgrad ? + -problem_shape.dilation[NumSpatialDimensions-1-i] : + problem_shape.dilation[NumSpatialDimensions-1-i]; + } + + return make_im2col_tma_atom_A_sm100( + GmemTiledCopyA{}, + tensor_a, + SmemLayoutA{}(_,_,_,cute::Int<0>{}), + TileShape{}, + TiledMma{}, + cluster_shape_vmnk, + shape(lower_corner_whd), + shape(upper_corner_whd), + cute::reverse(shape(problem_shape.lower_padding)), + cute::reverse(shape(problem_shape.upper_padding)), + cute::reverse(shape(problem_shape.traversal_stride)), + shape(lower_srt), + shape(stride_srt)); + } + // TMA tiled mode for tensor A in wgrad and strided dgrad + else { + return make_tma_atom_A_sm100( + GmemTiledCopyA{}, + tensor_a, + SmemLayoutA{}(_,_,_,cute::Int<0>{}), + TileShape{}, + TiledMma{}, + cluster_shape_vmnk); + } + } + + template + static constexpr auto + get_tma_load_b_instance( + TensorB const& tensor_b, + ProblemShape const& problem_shape, + ClusterShapeVMNK const& cluster_shape_vmnk) { + + if constexpr (is_im2col_B) { + // compute the upper and lower corners based on the conv padding + auto lower_corner_whd = detail::compute_lower_corner_whd(problem_shape); + auto upper_corner_whd = detail::compute_upper_corner_whd(problem_shape); + auto lower_srt = detail::compute_lower_srt(problem_shape); + + 
return make_im2col_tma_atom_B_sm100( + GmemTiledCopyB{}, + tensor_b, + SmemLayoutB{}(_,_,_,cute::Int<0>{}), + TileShape{}, + TiledMma{}, + cluster_shape_vmnk, + shape(lower_corner_whd), + shape(upper_corner_whd), + cute::reverse(shape(problem_shape.lower_padding)), + cute::reverse(shape(problem_shape.upper_padding)), + cute::reverse(shape(problem_shape.traversal_stride)), + shape(lower_srt), + cute::reverse(shape(problem_shape.dilation))); + } + else { + return make_tma_atom_B_sm100( + GmemTiledCopyB{}, + tensor_b, + SmemLayoutB{}(_,_,_,cute::Int<0>{}), + TileShape{}, + TiledMma{}, + cluster_shape_vmnk); + } + } + +public: + + // Performs im2col transformations on the input of type ConvProblemShape + static constexpr auto + get_problem_shape_MNKL(ProblemShape const& problem_shape) { + if constexpr (is_im2col_A || is_im2col_B) { + // transformation + im2col linearization + return cutlass::conv::detail::get_linearized_problem_shape_MNKL(problem_shape); + } + else { + // transformation + return cutlass::conv::detail::get_transformed_problem_shape_MNKL(problem_shape); + } + } + + // Device-side kernel params + // + // Arguments has the untransformed problem shape from the user. + // Params will have the transformed problem shape. + struct Params { + using _Submode = decltype(take<0,NumTensorDimensions-1>(typename ProblemShape::TensorExtent{})); + + using ClusterLayout_VMNK = decltype(tiled_divide(make_layout(conditional_return(make_shape(uint32_t(0), uint32_t(0), Int<1>{}), ClusterShape{})), + make_tile(typename TiledMma::AtomThrID{}))); + + // Assumption: StrideA is congruent with Problem_MK + // Select TMA load type according to convolution operator. 
+ using TensorShapeA = cute::conditional_t; + + using TensorShapeB = cute::conditional_t; + + using TMA_A = decltype(get_tma_load_a_instance( + make_tensor( + make_gmem_ptr(recast_ptr(nullptr)), + make_layout(TensorShapeA{}, StrideA{})), + ConvProblemShape{}, + ClusterLayout_VMNK{})); + + using TMA_B = decltype(get_tma_load_b_instance( + make_tensor( + make_gmem_ptr(recast_ptr(nullptr)), + make_layout(TensorShapeB{}, StrideB{})), + ConvProblemShape{}, + ClusterLayout_VMNK{})); + + // Members + TMA_A tma_load_a; + TMA_B tma_load_b; + TMA_A tma_load_a_fallback; + TMA_B tma_load_b_fallback; + dim3 cluster_shape_fallback; + }; + + // + // Constructor + // + CUTLASS_DEVICE + CollectiveConv(Params const& params, ClusterShape cluster_shape, uint32_t block_rank_in_cluster) + : cluster_shape_(cluster_shape) + , block_rank_in_cluster_(block_rank_in_cluster) { + if constexpr (IsDynamicCluster) { + const bool is_fallback_cluster = (cute::size<0>(cluster_shape_) == params.cluster_shape_fallback.x && + cute::size<1>(cluster_shape_) == params.cluster_shape_fallback.y); + observed_tma_load_a_ = is_fallback_cluster ? ¶ms.tma_load_a_fallback : ¶ms.tma_load_a; + observed_tma_load_b_ = is_fallback_cluster ? ¶ms.tma_load_b_fallback : ¶ms.tma_load_b; + } + else { + observed_tma_load_a_ = ¶ms.tma_load_a; + observed_tma_load_b_ = ¶ms.tma_load_b; + } + } + + // + // Methods + // + + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cutlass::KernelHardwareInfo const& hw_info = cutlass::KernelHardwareInfo{}) { + (void) workspace; + + // from the flat problem shape arrays of ConvProblemShape, create a rank-3 MNK problem shape tuple + // tma desc creation depends on the original untransformed domain. + + // A extents. + auto shape_A_orig = problem_shape.get_shape_A(); + // B extents. 
+ auto shape_B_orig = problem_shape.get_shape_B(); + + // Fill inferred cute strides from flat stride arrays + auto dA = make_cute_packed_stride(StrideA{}, problem_shape.stride_A, ConvOp); + auto dB = make_cute_packed_stride(StrideB{}, problem_shape.stride_B, ConvOp); + + auto ptr_A = recast_ptr(args.ptr_A); + auto ptr_B = recast_ptr(args.ptr_B); + + Tensor tensor_a = make_tensor(make_gmem_ptr(ptr_A), make_layout(shape_A_orig, dA)); + Tensor tensor_b = make_tensor(make_gmem_ptr(ptr_B), make_layout(shape_B_orig, dB)); + + auto cluster_shape = cutlass::detail::select_cluster_shape(ClusterShape{}, hw_info.cluster_shape); + // Cluster layout for TMA construction + auto cluster_layout_vmnk = tiled_divide(make_layout(cluster_shape), make_tile(typename TiledMma::AtomThrID{})); + auto cluster_shape_fallback = cutlass::detail::select_cluster_shape(ClusterShape{}, hw_info.cluster_shape_fallback); + + // Cluster layout for TMA construction + auto cluster_layout_vmnk_fallback = tiled_divide(make_layout(cluster_shape_fallback), make_tile(typename TiledMma::AtomThrID{})); + + auto tma_load_a = get_tma_load_a_instance(tensor_a, problem_shape, cluster_layout_vmnk); + auto tma_load_b = get_tma_load_b_instance(tensor_b, problem_shape, cluster_layout_vmnk); + auto tma_load_a_fallback = get_tma_load_a_instance(tensor_a, problem_shape, cluster_layout_vmnk_fallback); + auto tma_load_b_fallback = get_tma_load_b_instance(tensor_b, problem_shape, cluster_layout_vmnk_fallback); + + static_assert(size(typename decltype(tma_load_a)::ThrID{}) == size(AtomThrShapeMNK{})); + static_assert(size(typename decltype(tma_load_b)::ThrID{}) == size(AtomThrShapeMNK{})); + + return { + tma_load_a, + tma_load_b, + tma_load_a_fallback, + tma_load_b_fallback, + hw_info.cluster_shape_fallback + }; + } + + template + static bool + can_implement( + ProblemShape const& problem_shape, + Arguments const& args) { + // Activation and Filter channel mode extents much match + bool implementable = true; + // channel 
mode is major + { + const bool check = problem_shape.stride_A[NumTensorDimensions-1] == 1; +#if (! defined(__CUDA_ARCH__)) && (CUTLASS_DEBUG_TRACE_LEVEL > 0) + if (not check) { + const auto offending_stride = + problem_shape.stride_A[NumTensorDimensions-1]; + std::ostringstream os; + os << "CollectiveConv::can_implement: " + "problem_shape.stride_A[NumTensorDimensions-1 = " + << (NumTensorDimensions-1) << "] = " + << offending_stride << " != 1"; + CUTLASS_TRACE_HOST( os.str() ); + } +#endif + implementable &= check; + } + + { + const bool check = problem_shape.stride_B[NumTensorDimensions-1] == 1; +#if (! defined(__CUDA_ARCH__)) && (CUTLASS_DEBUG_TRACE_LEVEL > 0) + if (not check) { + const auto offending_stride = + problem_shape.stride_B[NumTensorDimensions-1]; + std::ostringstream os; + os << "CollectiveConv::can_implement: " + "problem_shape.stride_B[NumTensorDimensions-1 = " + << (NumTensorDimensions-1) << "] = " + << offending_stride << " != 1\n"; + CUTLASS_TRACE_HOST( os.str() ); + } +#endif + implementable &= check; + } + + { + const auto & traversal_stride = problem_shape.traversal_stride; + for (auto stride: traversal_stride) { + implementable &= (stride >= 1 && stride <= 8); + } + } + + if constexpr (ConvOp == conv::Operator::kDgrad && not is_strided_dgrad) { + const auto & traversal_stride = problem_shape.traversal_stride; + for (auto stride: traversal_stride) { + implementable &= (stride == 1); + } + } + + constexpr int tma_alignment_bits = 128; + // A extents. + auto shape_A_orig = problem_shape.get_shape_A(); + // B extents. 
+ auto shape_B_orig = problem_shape.get_shape_B(); + + constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits::value; + { + const bool check = cutlass::detail::check_alignment(shape_A_orig, StrideA{}); + if (not check) { + CUTLASS_TRACE_HOST("A shape and/or strides have alignment issue."); + } + implementable &= check; + } + + constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits::value; + { + const bool check = cutlass::detail::check_alignment(shape_B_orig, StrideB{}); + if (not check) { + CUTLASS_TRACE_HOST("B shape and/or strides have alignment issue."); + } + implementable &= check; + } + + if (not implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); + return false; + } + + if (is_im2col_A || is_im2col_B) { + // Check valid corner values for TMA_LOAD_IM2COL, signed int ranging from [-corner_limit, corner_limit - 1] + constexpr int32_t corner_limit = 1 << (16 / NumSpatialDimensions - 1); + auto lower_corner_whd = detail::compute_lower_corner_whd(problem_shape); + for (int i = 0; i < problem_shape.RankS; ++i) { + implementable = implementable && lower_corner_whd[i] >= -corner_limit && lower_corner_whd[i] <= (corner_limit - 1); + } + auto upper_corner_whd = detail::compute_upper_corner_whd(problem_shape); + for (int i = 0; i < problem_shape.RankS; ++i) { + implementable = implementable && upper_corner_whd[i] >= -corner_limit && upper_corner_whd[i] <= (corner_limit - 1); + } + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Padding values don't meet requirements for TMA LOAD IM2COL.\n"); + return false; + } + } + + if (is_im2col_A || is_im2col_B) { + // Check valid filter offsets for TMA_LOAD_IM2COL, unsigned int ranging from [0, offset_limit] + constexpr int32_t offset_limit = (1 << (16 / NumSpatialDimensions)) - 1; + auto flt_data = (ConvOp == conv::Operator::kWgrad) ? 
problem_shape.shape_C : problem_shape.shape_B; + for (int i = 0; i < problem_shape.RankS; ++i) { + // flt_data array contains [K, T, R, S, C], so pure filter [T, R, S] starts from the second position in the array + implementable = implementable && ((flt_data[i+1] - 1) * problem_shape.dilation[i] >= 0) + && ((flt_data[i+1] - 1) * problem_shape.dilation[i] <= offset_limit); + } + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: tensor coordinate offset values don't meet requirements for TMA LOAD IM2COL.\n"); + return false; + } + } + + // Wgrad kernels don't support non-packed output strides, non-packed tensor A stride (linearized) + if constexpr (ConvOp == conv::Operator::kWgrad) { + + const auto & input_shape = problem_shape.shape_A; + const auto & input_stride = problem_shape.stride_A; + + implementable &= input_stride[ProblemShape::RankT - 1] == 1; + int64_t input_shape_size = 1; + for (int i = ProblemShape::RankT - 2; i >= 0; --i) { + input_shape_size *= input_shape[i + 1]; + implementable &= input_stride[i] == input_shape_size; + } + + const auto & output_shape = problem_shape.shape_C; + const auto & output_stride = problem_shape.stride_C; + + implementable &= output_stride[ProblemShape::RankT - 1] == 1; + int64_t output_shape_size = 1; + for (int i = ProblemShape::RankT - 2; i >= 0; --i) { + output_shape_size *= output_shape[i + 1]; + implementable &= output_stride[i] == output_shape_size; + } + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Wgrad kernels don't support non-packed output strides.\n"); + return false; + } + } + + // Conv kernels only support cross correlation mode currently. + { + implementable &= problem_shape.mode == cutlass::conv::Mode::kCrossCorrelation; + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Conv kernels only support cross correlation mode currently.\n"); + return false; + } + } + + // When groups > 1, it should be a Grouped Conv. 
+ if (problem_shape.groups > 1) { + implementable &= TileShapeMNKLRank > 3; + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Only Grouped Conv can support groups > 1.\n"); + return false; + } + } + + // Only support Grouped Wgrad currently. + if constexpr (TileShapeMNKLRank > 3) { + implementable &= ConvOp == conv::Operator::kWgrad; + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Grouped Conv Only support Grouped Wgrad currently.\n"); + return false; + } + } + + // Grouped Wgrad channel check. + if constexpr (is_grouped_wgrad) { + + int input_K = size<0>(problem_shape.get_shape_A()); + int input_C = size<0>(problem_shape.get_shape_B()); + + implementable &= input_K == input_C; + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Grouped Conv's input K and input C do not match.\n"); + return false; + } + + int output_K = size<0>(problem_shape.get_shape_C()); + int output_C = size<1,0>(problem_shape.get_shape_C()); + + implementable &= input_K == output_K; + implementable &= input_C == output_C * problem_shape.groups; + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Grouped Wgrad's input and output K,C and groups do not match\n"); + return false; + } + + constexpr int Tile_N = size<1>(TileShape{}); + constexpr int GroupsPerTile = size<3>(TileShapeMNKL_{}); + + implementable &= Tile_N / GroupsPerTile == input_C / problem_shape.groups; + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Grouped Wgrad's Tile_N, GroupsPerTile and input_C, groups do not match.\n"); + return false; + } + } + + // The extents of linearized problem shape should be int32_t type(maximum is 2^31-1). 
+ if constexpr (is_im2col_A || is_im2col_B) { + auto [M, N, K, L] = cutlass::conv::detail::get_transformed_problem_shape_MNKL(problem_shape); + auto to_64b = [](auto S) { return transform_leaf(S, [](auto s) { return static_cast(s); }); }; + + if constexpr (ConvOp == conv::Operator::kFprop || ConvOp == conv::Operator::kDgrad) { + implementable &= (cute::product(to_64b(M)) <= cutlass::platform::numeric_limits::max()) & + (cute::product(to_64b(L)) <= cutlass::platform::numeric_limits::max()); + } + else if constexpr (ConvOp == conv::Operator::kWgrad) { + implementable &= (cute::product(to_64b(K)) <= cutlass::platform::numeric_limits::max()); + } + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: the extents exceed the maximum number.\n"); + return false; + } + } + + return true; + } + + /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance + CUTLASS_DEVICE void + prefetch_tma_descriptors() { + cute::prefetch_tma_descriptor(observed_tma_load_a_->get_tma_descriptor()); + cute::prefetch_tma_descriptor(observed_tma_load_b_->get_tma_descriptor()); + } + + /// Construct A Single Stage's Accumulator Shape + CUTLASS_DEVICE static auto + partition_accumulator_shape() { + auto acc_shape = partition_shape_C(TiledMma{}, take<0,2>(TileShape{})); // ((MMA_TILE_M,MMA_TILE_N),MMA_M,MMA_N) + + return acc_shape; + } + + /// Perform a collective-scoped matrix multiply-accumulate + /// Producer Perspective + template < + class GTensorA, class GTensorB, + class GTensorPartitionedA, class GTensorPartitionedB, + class STensorA, class STensorB, + class TileCoordMNKL, + class KTileIterator + > + CUTLASS_DEVICE auto + load( + Params const& params, + MainloopPipeline pipeline, + MainloopPipelineState mainloop_pipe_producer_state, + cute::tuple const& load_inputs, + TileCoordMNKL const& cta_coord_mnkl, + KTileIterator k_tile_iter, int k_tile_count) { + + auto [unused_gA, unused_gB, + tAgA_mk, tBgB_nk, tAsA, tBsB, + mcast_mask_a, mcast_mask_b] = 
load_inputs; + + // slice out the work coord from partitioned tensors + Tensor tAgA = tAgA_mk(_, get<0>(cta_coord_mnkl) / size(typename TiledMma::AtomThrID{}), _); + auto tensor_b_coord = get<1>(cta_coord_mnkl); + if constexpr (is_grouped_wgrad) { + // in grouped wgrad, tensor A = NZPQK, tensor B = NDHWC, tensor C = KTRSc, where C = G*c, c = channel_per_group = 8,16,32. + // CTA Tiling follows output tensor KTRSc. So cta_size_m = K/CTA_TILE_M. cta_size_n = T*R*S*ceil(c/CTA_TILE_N) = T*R*S*1 = T*R*S. + // tensor_a_coord = K_idx = cta_coord_m. + // tensor_b_coord = TRS_idx * C/CTA_TILE_N + C_idx = cta_coord_n * get<1,0>(shape(tBgB_nk) + cta_coord_m, + // because K == C and CTA_TILE_M == CTA_TILE_N => C_idx = K_idx = cta_coord_m. + tensor_b_coord = get<0>(cta_coord_mnkl) + get<1>(cta_coord_mnkl) * get<1,0>(shape(tBgB_nk)); + } + Tensor tBgB = tBgB_nk(_, tensor_b_coord, _); + + auto barrier_token = pipeline.producer_try_acquire(mainloop_pipe_producer_state); + + // Issue the Mainloop loads + CUTLASS_PRAGMA_NO_UNROLL + while (k_tile_count > 0) { + // LOCK mainloop_pipe_producer_state for _writing_ + pipeline.producer_acquire(mainloop_pipe_producer_state, barrier_token); + + using BarrierType = typename MainloopPipeline::ProducerBarrierType; + BarrierType* tma_barrier = pipeline.producer_get_barrier(mainloop_pipe_producer_state); + + int write_stage = mainloop_pipe_producer_state.index(); + ++mainloop_pipe_producer_state; + barrier_token = pipeline.producer_try_acquire(mainloop_pipe_producer_state); + + if constexpr (is_strided_dgrad) { + // construct gemm-k tile coord for gB + auto [conv_k, flt_coord, out_coord] = *k_tile_iter; + auto gemm_k_tile = prepend(flt_coord, conv_k); // (k,s,r,t) + + // gA doesn't have a gemm-k (k,s,r,t) iterator mode because it's not an im2col tensor + auto offset_kqpzn = append(prepend(out_coord, _0{}),_0{}); // (k,q,p,z,n) + auto tAgA_offset = make_tensor(tAgA.data() + offset_kqpzn, tAgA.layout()); // (TMA, k) + + if (cute::elect_one_sync()) 
{ + copy(observed_tma_load_a_->with(*tma_barrier, mcast_mask_a), tAgA_offset(_,conv_k), tAsA(_,write_stage)); + copy(observed_tma_load_b_->with(*tma_barrier, mcast_mask_b), tBgB(_,gemm_k_tile) , tBsB(_,write_stage)); + } + } + else { + if (cute::elect_one_sync()) { + copy(observed_tma_load_a_->with(*tma_barrier, mcast_mask_a), tAgA(_,*k_tile_iter), tAsA(_,write_stage)); + copy(observed_tma_load_b_->with(*tma_barrier, mcast_mask_b), tBgB(_,*k_tile_iter), tBsB(_,write_stage)); + } + } + + --k_tile_count; + ++k_tile_iter; + } + + return cute::make_tuple(mainloop_pipe_producer_state, k_tile_iter); + } + + /// Set up the data needed by this collective for load. + /// Return tuple element contain + /// gA_mk - The tiled tma tensor for input A + /// gB_nk - The tiled tma tensor for input B + /// tAsA - partitioned smem tensor for A + /// tBsB - partitioned smem tensor for B + /// mcast_mask_a - tma multicast mask for A + /// mcast_mask_b - tma multicast mask for B + template + CUTLASS_DEVICE auto + load_init( + ProblemShape_MNKL const& problem_shape_MNKL, + Params const& params, + TensorStorage& shared_tensors) const { + using X = Underscore; + + // Separate out problem shape for convenience + auto [M,N,K,L] = problem_shape_MNKL; + + // Represent the full tensors -- get these from TMA + auto K_A = conditional_return(get<0>(K), K); + Tensor mA_mk = observed_tma_load_a_->get_tma_tensor(make_shape(M, K_A)); + Tensor mB_nk = observed_tma_load_b_->get_tma_tensor(make_shape(N, K)); + + // Tile the tensors and defer the slice + Tensor gA_mk = local_tile(mA_mk, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{}); // (BLK_M, BLK_K, m, k) + Tensor gB_nk = local_tile(mB_nk, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{}); // (BLK_N, BLK_K, n, k) + + // Partition for this CTA + ThrMMA cta_mma = TiledMma{}.get_slice(blockIdx.x % size(typename TiledMma::AtomThrID{})); + + Tensor tCgA_mk = cta_mma.partition_A(gA_mk); // (MMA, MMA_M, MMA_K, m, k) + Tensor tCgB_nk = 
cta_mma.partition_B(gB_nk); // (MMA, MMA_N, MMA_K, n, k) + + Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.begin()), SmemLayoutA{}); // (MMA,MMA_M,MMA_K,PIPE) + Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.begin()), SmemLayoutB{}); // (MMA,MMA_N,MMA_K,PIPE) + + // Define the CTA-in-cluster Layout and Coord + Layout cta_layout_mnk = make_layout(cluster_shape_); + Layout cta_layout_vmnk = tiled_divide(cta_layout_mnk, make_tile(typename TiledMma::AtomThrID{})); + auto cta_coord_vmnk = cta_layout_vmnk.get_flat_coord(block_rank_in_cluster_); + + // Project the cta_layout for tma_a along the n-modes + auto [tAgA_mk, tAsA] = tma_partition(*observed_tma_load_a_, + get<2>(cta_coord_vmnk), make_layout(size<2>(cta_layout_vmnk)), + group_modes<0,3>(sA), group_modes<0,3>(tCgA_mk)); + + // Project the cta_layout for tma_b along the m-modes + auto [tBgB_nk, tBsB] = tma_partition(*observed_tma_load_b_, + get<1>(cta_coord_vmnk), make_layout(size<1>(cta_layout_vmnk)), + group_modes<0,3>(sB), group_modes<0,3>(tCgB_nk)); + + // TMA Multicast Masks + uint16_t mcast_mask_a = create_tma_multicast_mask<2>(cta_layout_vmnk, cta_coord_vmnk); + uint16_t mcast_mask_b = create_tma_multicast_mask<1>(cta_layout_vmnk, cta_coord_vmnk); + + return cute::make_tuple( + gA_mk, gB_nk, // for scheduler + tAgA_mk, tBgB_nk, tAsA, tBsB, // for input tensor values + mcast_mask_a, mcast_mask_b); // multicast masks + } + + /// Perform a Producer Epilogue to prevent early exit of ctas in a Cluster + CUTLASS_DEVICE void + load_tail(MainloopPipeline pipeline, MainloopPipelineState mainloop_pipe_producer_state) { + // Issue the epilogue waits + /* This helps avoid early exit of ctas in Cluster + * Waits for all stages to either be released (all + * Consumer UNLOCKs), or if the stage was never used + * then would just be acquired since the phase was + * still inverted from make_producer_start_state + */ + pipeline.producer_tail(mainloop_pipe_producer_state); + } + + /// Perform a 
collective-scoped matrix multiply-accumulate + /// Consumer Perspective + template < + class FrgEngine, class FrgLayout, + class FragmentA, class FragmentB + > + CUTLASS_DEVICE auto + mma(MainloopPipeline pipeline, + MainloopPipelineState mainloop_pipe_consumer_state, + cute::Tensor& accumulators, + cute::tuple const& mma_inputs, + int k_tile_count) + { + static_assert(is_tmem::value, "Accumulator must be tmem resident."); + static_assert(rank(FrgLayout{}) == 3, "Accumulator must be MMA-partitioned: (MMA, MMA_M, MMA_N)"); + + auto [tiled_mma, tCrA, tCrB] = mma_inputs; + + uint32_t skip_wait = k_tile_count <= 0; + auto barrier_token = pipeline.consumer_try_wait(mainloop_pipe_consumer_state, skip_wait); + + // + // PIPELINED MAIN LOOP + // + tiled_mma.accumulate_ = UMMA::ScaleOut::Zero; + + CUTLASS_PRAGMA_NO_UNROLL + while (k_tile_count > 0) { + // WAIT on mainloop_pipe_consumer_state until its data are available (phase bit flips from mainloop_pipe_consumer_state.phase() value) + pipeline.consumer_wait(mainloop_pipe_consumer_state, barrier_token); + + // Compute on k_tile + int read_stage = mainloop_pipe_consumer_state.index(); + // Save current mainlop pipeline read state + auto curr_mainloop_pipe_consumer_state = mainloop_pipe_consumer_state; + + // Advance mainloop_pipe + ++mainloop_pipe_consumer_state; + --k_tile_count; + skip_wait = k_tile_count <= 0; + // Peek at next iteration + barrier_token = pipeline.consumer_try_wait(mainloop_pipe_consumer_state, skip_wait); + + // Unroll the K mode manually so we can set scale C to 1 + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { + // (V,M,K) x (V,N,K) => (V,M,N) + cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulators); + tiled_mma.accumulate_ = UMMA::ScaleOut::One; + } + pipeline.consumer_release(curr_mainloop_pipe_consumer_state); + } + + return mainloop_pipe_consumer_state; + } + + CUTLASS_DEVICE auto + mma_init(TensorStorage& 
shared_tensors) const { + Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) + Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) + + TiledMma tiled_mma; + + // Allocate "fragments/descriptors" for A and B matrices + Tensor tCrA = tiled_mma.make_fragment_A(sA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCrB = tiled_mma.make_fragment_B(sB); // (MMA,MMA_N,MMA_K,PIPE) + + CUTE_STATIC_ASSERT_V(Int{} == size<3>(sA)); // PIPE + CUTE_STATIC_ASSERT_V(Int{} == size<3>(sB)); // PIPE + return cute::make_tuple(tiled_mma, tCrA, tCrB); + } + +private: + + typename Params::TMA_A const* observed_tma_load_a_ = nullptr; + typename Params::TMA_B const* observed_tma_load_b_ = nullptr; + + ClusterShape cluster_shape_; + uint32_t block_rank_in_cluster_; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::conv::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp new file mode 100644 index 0000000000000000000000000000000000000000..11eefed94182c8d8870a65c9f4d937ede5db5421 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp @@ -0,0 +1,785 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include "cutlass/cutlass.h" + +#include "cute/arch/cluster_sm90.hpp" +#include "cute/arch/copy_sm90.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cute/atom/copy_traits_sm90_im2col.hpp" +#include "cute/numeric/arithmetic_tuple.hpp" +#include "cute/algorithm/functional.hpp" +#include "cute/algorithm/gemm.hpp" + +#include "cutlass/conv/detail.hpp" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/dispatch_policy.hpp" +#include "cutlass/pipeline/pipeline.hpp" +#include "cutlass/util/packed_stride.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv::collective { +using namespace cute; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + conv::Operator ConvOp, + int Stages, + int NumSpatialDims, + class ClusterShape, + class KernelSchedule, + int PipelineAsyncMmaStages, + class TileShape_, + class ElementA_, + class ElementB_, + class TiledMma_, + class TileTraitsA_, + class TileTraitsB_> +struct CollectiveConv< + MainloopSm90TmaGmmaWarpSpecializedImplicitGemm< + ConvOp, Stages, NumSpatialDims, ClusterShape, KernelSchedule, PipelineAsyncMmaStages>, + TileShape_, + ElementA_, + ElementB_, + TiledMma_, + TileTraitsA_, + TileTraitsB_> +{ + // + // Type Aliases + // + using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedImplicitGemm< + ConvOp, Stages, NumSpatialDims, ClusterShape, KernelSchedule, PipelineAsyncMmaStages>; + using TileShape = TileShape_; + using ElementA = ElementA_; + using ElementB = ElementB_; + using TiledMma = TiledMma_; + using ElementAccumulator = typename TiledMma::ValTypeC; + using GmemTiledCopyA = typename TileTraitsA_::GmemTiledCopy; + using GmemTiledCopyB = typename TileTraitsB_::GmemTiledCopy; + using SmemLayoutA = typename TileTraitsA_::SmemLayout; + using 
SmemLayoutB = typename TileTraitsB_::SmemLayout; + using ArchTag = typename DispatchPolicy::ArchTag; + static constexpr int NumSpatialDimensions = DispatchPolicy::NumSpatialDimensions; + static constexpr int NumTensorDimensions = NumSpatialDimensions + 2; + // Deduce the kernel-facing stride tuple types based on the dispatch policy + // (which is a function of the number of spatial dimensions, the algorithm, etc.) + using StrideA = decltype(detail::sm90_dispatch_policy_to_stride_A()); + using StrideB = decltype(detail::sm90_dispatch_policy_to_stride_B()); + + using MainloopPipeline = cutlass::PipelineTmaAsync; + + using PipelineParams = typename MainloopPipeline::Params; + using PipelineState = typename cutlass::PipelineState; + + using ProblemShape = ConvProblemShape; + + static_assert(rank(SmemLayoutA{}) == 3, "SmemLayout must be rank 3 (M/N, K, PIPE)"); + static_assert((size<0>(TileShape{}) == size<0>(SmemLayoutA{})), "SmemLayout must be compatible with the tile shape."); + static_assert((size<2>(TileShape{}) == size<1>(SmemLayoutA{})), "SmemLayout must be compatible with the tile shape."); + + static_assert(rank(SmemLayoutB{}) == 3, "SmemLayout must be rank 3 (M/N, K, PIPE)"); + static_assert((size<1>(TileShape{}) == size<0>(SmemLayoutB{})), "SmemLayout must be compatible with the tile shape."); + static_assert((size<2>(TileShape{}) == size<1>(SmemLayoutB{})), "SmemLayout must be compatible with the tile shape."); + + static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 1 or more."); + static_assert(cute::is_base_of::value && + cute::is_base_of::value, + "MMA atom must source both A and B operand from smem_desc for this mainloop."); + + // The tma load mode of wgrad is tiled for tensor A and im2col for tensor B while the tma load mode of fprop and dgrad + // kernel is im2col for tensor A and tiled for tensor B. 
+ static_assert((ConvOp == conv::Operator::kWgrad + && (cute::is_same_v || cute::is_same_v)) + || (ConvOp != conv::Operator::kWgrad + && (cute::is_same_v || cute::is_same_v)), + "GmemTiledCopyA - invalid SM90 TMA copy atom specified."); + static_assert((ConvOp == conv::Operator::kWgrad + && (cute::is_same_v || cute::is_same_v)) + || (ConvOp != conv::Operator::kWgrad + && (cute::is_same_v || cute::is_same_v)), + "GmemTiledCopyB - invalid SM90 TMA copy atom specified."); + + static constexpr bool is_im2col_A = detail::is_im2col_load::value; + static constexpr bool is_im2col_B = detail::is_im2col_load::value; + + // TMA converts f32 input to tf32 when copying from GMEM to SMEM + // For all other types, cast to size equivalent uint type to avoid any rounding by TMA. + static constexpr bool ConvertF32toTF32A = cute::is_same_v; + static constexpr bool ConvertF32toTF32B = cute::is_same_v; + using InternalElementA = cute::conditional_t>>; + using InternalElementB = cute::conditional_t>>; + + struct SharedStorage + { + struct TensorStorage : cute::aligned_struct<128, _0> { + cute::array_aligned> smem_A; + cute::array_aligned> smem_B; + } tensors; + + using PipelineStorage = typename MainloopPipeline::SharedStorage; + PipelineStorage pipeline; + }; + using TensorStorage = typename SharedStorage::TensorStorage; + using PipelineStorage = typename SharedStorage::PipelineStorage; + + static constexpr int K_PIPE_MAX = DispatchPolicy::Stages; + static constexpr int K_PIPE_MMAS = DispatchPolicy::PipelineAsyncMmaStages; + static constexpr uint32_t TmaTransactionBytes = + (size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast(sizeof(InternalElementA)))+ + (size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast(sizeof(InternalElementB))); + + // Host side kernel arguments + struct Arguments { + ElementA const* ptr_A{nullptr}; + ElementB const* ptr_B{nullptr}; + }; + +private: + // Note that for fprop and dgrad kernel, the tma load mode is im2col for tensor A and tiled 
for + // tensor B while for wgrad kernel, the tma load mode is tiled for tensor A and im2col for tensor + // B since operand A, B is swapped. + // Get tma_load_a instantce. + template + static constexpr auto + get_tma_load_a_instance(TensorA const& tensor_a, ProblemShape const& problem_shape) { + if constexpr (is_im2col_A) { + // compute the upper and lower corners based on the conv padding + auto lower_corner_whd = detail::compute_lower_corner_whd(problem_shape); + auto upper_corner_whd = detail::compute_upper_corner_whd(problem_shape); + auto lower_srt = detail::compute_lower_srt(problem_shape); + + // The calculation of gbasis strides for dgrad kernel needs perform negate for dilation values. + cute::array stride_srt{}; + for (int i = 0; i < NumSpatialDimensions; ++i) { + stride_srt[i] = ConvOp == conv::Operator::kDgrad ? + -problem_shape.dilation[NumSpatialDimensions-1-i] : + problem_shape.dilation[NumSpatialDimensions-1-i]; + } + + return make_im2col_tma_copy( + GmemTiledCopyA{}, + tensor_a, + SmemLayoutA{}(_,_,_0{}), + product_each(shape(SmemLayoutA{}(_,_,_0{}))), + size<1>(ClusterShape{}), + shape(lower_corner_whd), + shape(upper_corner_whd), + cute::reverse(shape(problem_shape.lower_padding)), + cute::reverse(shape(problem_shape.upper_padding)), + cute::reverse(shape(problem_shape.traversal_stride)), + shape(lower_srt), + shape(stride_srt)); + } + // TMA tiled mode for tensor A in wgrad kernel. + else { + return make_tma_copy( + GmemTiledCopyA{}, + tensor_a, + SmemLayoutA{}(_,_,_0{}), + make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})), + size<1>(ClusterShape{})); + } + } + + // Get tma_load_b instantce. + template + static constexpr auto + get_tma_load_b_instance(TensorB const& tensor_b, ProblemShape const& problem_shape) { + // TMA im2col mode for tensor B in wgrad kernel. 
+ if constexpr (is_im2col_B) { + // compute the upper and lower corners based on the conv padding + auto lower_corner_whd = detail::compute_lower_corner_whd(problem_shape); + auto upper_corner_whd = detail::compute_upper_corner_whd(problem_shape); + auto lower_srt = detail::compute_lower_srt(problem_shape); + + return make_im2col_tma_copy( + GmemTiledCopyB{}, + tensor_b, + SmemLayoutB{}(_,_,_0{}), + product_each(shape(SmemLayoutB{}(_,_,_0{}))), + size<0>(ClusterShape{}), + shape(lower_corner_whd), + shape(upper_corner_whd), + cute::reverse(shape(problem_shape.lower_padding)), + cute::reverse(shape(problem_shape.upper_padding)), + cute::reverse(shape(problem_shape.traversal_stride)), + shape(lower_srt), + cute::reverse(shape(problem_shape.dilation))); + } + else { + return make_tma_copy( + GmemTiledCopyB{}, + tensor_b, + SmemLayoutB{}(_,_,_0{}), + make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})), + size<0>(ClusterShape{})); + } + } + +public: + + // Performs im2col transformations on the input of type ConvProblemShape + static constexpr auto + get_problem_shape_MNKL(ProblemShape const& problem_shape) { + + if constexpr (is_im2col_A || is_im2col_B) { + // transformation + im2col linearization + return cutlass::conv::detail::get_linearized_problem_shape_MNKL(problem_shape); + } + else { + // transformation + return cutlass::conv::detail::get_transformed_problem_shape_MNKL(problem_shape); + } + } + + // Device side kernel params + struct Params { + using _Submode = decltype(take<0,NumTensorDimensions-1>(typename ProblemShape::TensorExtent{})); + + // Assumption: StrideA is congruent with Problem_MK + // Select TMA load type according to convolution operator. 
+ using TensorShapeA = cute::conditional_t; + + using TensorShapeB = cute::conditional_t; + + using TMA_A = decltype(get_tma_load_a_instance( + make_tensor( + make_gmem_ptr(static_cast(nullptr)), + make_layout(TensorShapeA{}, StrideA{})), + ConvProblemShape{})); + + using TMA_B = decltype(get_tma_load_b_instance( + make_tensor( + make_gmem_ptr(static_cast(nullptr)), + make_layout(TensorShapeB{}, StrideB{})), + ConvProblemShape{})); + + // Members + TMA_A tma_load_a; + TMA_B tma_load_b; + uint32_t tma_transaction_bytes = TmaTransactionBytes; + }; + + // + // Methods + // + + // Lowers the host side user facing arguments to the kernel facing lauch params + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + (void) workspace; + // from the flat problem shape arrays of ConvProblemShape, create a rank-3 MNK problem shape tuple + // tma desc creation depends on the original untransformed domain. + + // A extents. + auto shape_A_orig = problem_shape.get_shape_A(); + // B extents. 
+ auto shape_B_orig = problem_shape.get_shape_B(); + + // Fill inferred cute strides from flat stride arrays + auto dA = make_cute_packed_stride(StrideA{}, problem_shape.stride_A, ConvOp); + auto dB = make_cute_packed_stride(StrideB{}, problem_shape.stride_B, ConvOp); + + auto ptr_A = reinterpret_cast(args.ptr_A); + auto ptr_B = reinterpret_cast(args.ptr_B); + + Tensor tensor_a = make_tensor(make_gmem_ptr(ptr_A), make_layout(shape_A_orig, dA)); + Tensor tensor_b = make_tensor(make_gmem_ptr(ptr_B), make_layout(shape_B_orig, dB)); + + auto tma_load_a = get_tma_load_a_instance(tensor_a, problem_shape); + auto tma_load_b = get_tma_load_b_instance(tensor_b, problem_shape); + + return { + tma_load_a, + tma_load_b, + TmaTransactionBytes + }; + } + + template + static bool + can_implement( + ProblemShape const& problem_shape, + Arguments const& args) { + // Activation and Filter channel mode extents much match + bool implementable = true; + // channel mode is major + implementable &= problem_shape.stride_A[NumTensorDimensions-1] == 1; + implementable &= problem_shape.stride_B[NumTensorDimensions-1] == 1; + + constexpr int tma_alignment_bits = 128; + // A extents. + auto shape_A_orig = problem_shape.get_shape_A(); + // B extents. + auto shape_B_orig = problem_shape.get_shape_B(); + constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits::value; + implementable = implementable && cutlass::detail::check_alignment(shape_A_orig, StrideA{}); + constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits::value; + implementable = implementable && cutlass::detail::check_alignment(shape_B_orig, StrideB{}); + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); + return false; + } + + // Check valid padding values for TMA_LOAD_IM2COL + constexpr int padding_limit = (ProblemShape::RankS == 1) ? 65536 : (ProblemShape::RankS == 2 ? 
256 : 16); + for (int i = 0; i < problem_shape.RankS; ++i) { + implementable = implementable && problem_shape.lower_padding[i] <= padding_limit && problem_shape.lower_padding[i] >= 0; + implementable = implementable && problem_shape.upper_padding[i] <= padding_limit && problem_shape.upper_padding[i] >= 0; + } + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Padding values don't meet requirements for TMA LOAD IM2COL.\n"); + return false; + } + + if (is_im2col_A || is_im2col_B) { + // Check valid corner values for TMA_LOAD_IM2COL, signed int ranging from [-corner_limit, corner_limit - 1] + constexpr int32_t corner_limit = 1 << (16 / NumSpatialDimensions - 1); + auto lower_corner_whd = detail::compute_lower_corner_whd(problem_shape); + for (int i = 0; i < problem_shape.RankS; ++i) { + implementable = implementable && lower_corner_whd[i] >= -corner_limit && lower_corner_whd[i] <= (corner_limit - 1); + } + auto upper_corner_whd = detail::compute_upper_corner_whd(problem_shape); + for (int i = 0; i < problem_shape.RankS; ++i) { + implementable = implementable && upper_corner_whd[i] >= -corner_limit && upper_corner_whd[i] <= (corner_limit - 1); + } + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Padding values don't meet requirements for TMA LOAD IM2COL.\n"); + return false; + } + } + + if (is_im2col_A || is_im2col_B) { + // Check valid filter offsets for TMA_LOAD_IM2COL, unsigned int ranging from [0, offset_limit - 1] + constexpr int32_t offset_limit = (1 << (16 / NumSpatialDimensions)) - 1; + auto flt_data = (ConvOp == conv::Operator::kWgrad) ? 
problem_shape.shape_C : problem_shape.shape_B; + for (int i = 0; i < problem_shape.RankS; ++i) { + // flt_data array contains [K, T, R, S, C], so pure filter [T, R, S] starts from the second position in the array + implementable = implementable && ((flt_data[i+1] - 1) * problem_shape.dilation[i] >= 0) + && ((flt_data[i+1] - 1) * problem_shape.dilation[i] < offset_limit); + } + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: tensor coordinate offset values don't meet requirements for TMA LOAD IM2COL.\n"); + return false; + } + } + + // Wgrad kernels don't support non-packed output strides, non-packed tensor A stride (linearized) + if constexpr (ConvOp == conv::Operator::kWgrad) { +#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1) + std::ostringstream os; +#endif + const auto & input_shape = problem_shape.shape_A; + const auto & input_stride = problem_shape.stride_A; + + implementable &= input_stride[ProblemShape::RankT - 1] == 1; + int64_t input_shape_size = 1; + for (int i = ProblemShape::RankT - 2; i >= 0; --i) { + input_shape_size *= input_shape[i + 1]; + implementable &= input_stride[i] == input_shape_size; +#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1) + if (input_stride[i] != input_shape_size) { + os << "\n *** input_stride[" << i << "] = " << input_stride[i] << " != input_shape_size = " << input_shape_size << " ***"; + } +#endif + } + + if (!implementable) { +#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1) + os << "\n input_shape_size: " << input_shape_size + << "\n input_shape: " << input_shape + << "\n input_stride: " << input_stride + << "\n"; +#endif + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Wgrad kernels don't support non-packed input strides.\n"); +#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1) + CUTLASS_TRACE_HOST(os.str()); +#endif + return false; + } + + const auto & output_shape = problem_shape.shape_C; + const auto & output_stride = 
problem_shape.stride_C; + + implementable &= output_stride[ProblemShape::RankT - 1] == 1; + int64_t output_shape_size = 1; + for (int i = ProblemShape::RankT - 2; i >= 0; --i) { + output_shape_size *= output_shape[i + 1]; + implementable &= output_stride[i] == output_shape_size; +#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1) + if (output_stride[i] != output_shape_size) { + os << "\n *** output_stride[" << i << "] = " << output_stride[i] << " != output_shape_size = " << output_shape_size << " ***"; + } +#endif + } + + if (!implementable) { +#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1) + os << "\n output_shape_size: " << input_shape_size + << "\n output_shape: " << input_shape + << "\n output_stride: " << input_stride + << "\n"; +#endif + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Wgrad kernels don't support non-packed output strides.\n"); +#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1) + CUTLASS_TRACE_HOST(os.str()); +#endif + return false; + } + } + + // Conv kernels only support cross correlation mode currently. 
+ implementable &= problem_shape.mode == cutlass::conv::Mode::kCrossCorrelation; + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Conv kernels only support cross correlation mode currently.\n"); + return false; + } + + if (problem_shape.groups > 1) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: This kernel does not support conv groups > 1.\n"); + return false; + } + + if constexpr (is_im2col_A || is_im2col_B) { + auto [M, N, K, L] = cutlass::conv::detail::get_transformed_problem_shape_MNKL(problem_shape); + auto to_64b = [](auto S) { return transform_leaf(S, [](auto s) { return static_cast(s); }); }; + + if constexpr (ConvOp == conv::Operator::kFprop || ConvOp == conv::Operator::kDgrad) { + implementable &= (cute::product(to_64b(M)) <= cutlass::platform::numeric_limits::max()) & + (cute::product(to_64b(L)) <= cutlass::platform::numeric_limits::max()); + } + else if constexpr (ConvOp == conv::Operator::kWgrad) { + implementable &= (cute::product(to_64b(K)) <= cutlass::platform::numeric_limits::max()); + } + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: the extents exceed the maximum number.\n"); + return false; + } + } + + return true; + } + + /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance + CUTLASS_DEVICE + static void prefetch_tma_descriptors(Params const& mainloop_params) { + cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor()); + cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor()); + } + + /// Set up the data needed by this collective for load and mma. + /// Returns a tuple of tensors. 
The collective and the kernel layer have the contract + /// Returned tuple must contain at least two elements, with the first two elements being: + /// gA_mk - The tma tensor, A after a local tile so it has shape (BLK_M,BLK_K,m,k) + /// gB_nk - The tma tensor, B after a local tile so it has shape (BLK_N,BLK_K,n,k) + /// The rest of the tensors can be specified as needed by this collective. + /// The dimensions of gA_mk and gA_nk do not contain L to maintain consistency with + /// StrideA and StrideB set up for TMA + template + CUTLASS_DEVICE auto + load_init(ProblemShapeMNKL const& problem_shape_MNKL, Params const& mainloop_params){ + //load_init(ProblemShapeMNKL const& problem_shape_MNKL, Params const& mainloop_params) const { + using X = Underscore; + // Separate out problem shape for convenience + auto [M, N, K, L] = problem_shape_MNKL; + + // TMA requires special handling of strides to deal with coord codomain mapping + // Represent the full tensors -- get these from TMA + Tensor mA_mk = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K)); // (m,k) + Tensor mB_nk = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K)); // (n,k) + + // Make tiled views, defer the slice + Tensor gA_mk = local_tile(mA_mk, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{}); // (BLK_M,BLK_K,m,k) + Tensor gB_nk = local_tile(mB_nk, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{}); // (BLK_N,BLK_K,n,k) + + return cute::make_tuple(gA_mk, gB_nk); + } + + /// Perform a collective-scoped matrix multiply-accumulate + /// Producer Perspective + template < + class TensorA, class TensorB, + class KTileIterator, class BlockCoord + > + CUTLASS_DEVICE void + load( + Params const& mainloop_params, + MainloopPipeline pipeline, + PipelineState smem_pipe_producer_state, + cute::tuple const& load_inputs, + BlockCoord const& blk_coord, + KTileIterator k_tile_iter, int k_tile_count, + int thread_idx, + uint32_t block_rank_in_cluster, + TensorStorage& shared_tensors) { + + int lane_predicate 
= cute::elect_one_sync(); + if (lane_predicate) { + Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) + Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) + + // + // Prepare the TMA loads for A and B + // + constexpr uint32_t cluster_shape_x = get<0>(ClusterShape()); + + uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x}; + auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y); + auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x); + + auto [gA_mk, gB_nk] = load_inputs; + + // Partition the inputs based on the current block coordinates. + auto [m_coord, n_coord, k_coord, l_coord] = blk_coord; + + Tensor gA = gA_mk(_,_,m_coord,_); // (BLK_M,BLK_K,k) + Tensor gB = gB_nk(_,_,n_coord,_); // (BLK_N,BLK_K,k) + + // Applies the mapping from block_tma_a + Tensor tAgA = block_tma_a.partition_S(gA); // (TMA,TMA_M,TMA_K,k) + Tensor tAsA = block_tma_a.partition_D(sA); // (TMA,TMA_M,TMA_K,PIPE) + + Tensor tBgB = block_tma_b.partition_S(gB); // (TMA,TMA_N,TMA_K,k) + Tensor tBsB = block_tma_b.partition_D(sB); // (TMA,TMA_N,TMA_K,PIPE) + + uint16_t mcast_mask_a = 0; + uint16_t mcast_mask_b = 0; + + // Issue TmaLoads + // Maps the tile -> block, value + if constexpr (cute::is_same_v || + cute::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int n = 0; n < size<1>(block_layout); ++n) { + mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{})); + } + } + + if constexpr (cute::is_same_v || + cute::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int m = 0; m < size<0>(block_layout); ++m) { + mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{})); + } + } + + // Mainloop + CUTLASS_PRAGMA_NO_UNROLL + for ( ; k_tile_count > 0; --k_tile_count) { + // LOCK 
smem_pipe_producer_state for _writing_ + pipeline.producer_acquire(smem_pipe_producer_state); + + // + // Copy gmem to smem for *k_tile_iter + // + + using BarrierType = typename MainloopPipeline::ProducerBarrierType; + BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_producer_state); + + int write_stage = smem_pipe_producer_state.index(); + + copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage)); + copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage)); + ++k_tile_iter; + + // Advance smem_pipe_producer_state + ++smem_pipe_producer_state; + } + } + } + + /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster + CUTLASS_DEVICE void + load_tail(MainloopPipeline pipeline, PipelineState smem_pipe_producer_state) { + int lane_predicate = cute::elect_one_sync(); + + // Issue the epilogue waits + if (lane_predicate) { + /* This helps avoid early exit of blocks in Cluster + * Waits for all stages to either be released (all + * Consumer UNLOCKs), or if the stage was never used + * then would just be acquired since the phase was + * still inverted from make_producer_start_state + */ + pipeline.producer_tail(smem_pipe_producer_state); + } + } + + /// Perform a collective-scoped matrix multiply-accumulate + /// Consumer Perspective + template + CUTLASS_DEVICE void + mma(MainloopPipeline pipeline, + PipelineState smem_pipe_consumer_state, + FrgTensorC& accum, + int k_tile_count, + int thread_idx, + TensorStorage& shared_tensors, + Params const& mainloop_params) { + static_assert(is_rmem::value, "C tensor must be rmem resident."); + + Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) + Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) + + // + // Define C accumulators and A/B partitioning + // + + 
TiledMma tiled_mma; + auto thread_mma = tiled_mma.get_thread_slice(thread_idx); + + Tensor tCsA = thread_mma.partition_A(sA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCsB = thread_mma.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE) + + // Allocate "fragments/descriptors" + Tensor tCrA = thread_mma.make_fragment_A(tCsA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCrB = thread_mma.make_fragment_B(tCsB); // (MMA,MMA_N,MMA_K,PIPE) + + CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum)); // M + CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum)); // N + CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB)); // K + CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB)); // PIPE + CUTE_STATIC_ASSERT_V(Int{} == size<2>(sA)); // PIPE + CUTE_STATIC_ASSERT_V(Int{} == size<2>(sB)); // PIPE + + // + // PIPELINED MAIN LOOP + // + static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS < K_PIPE_MAX), + "ERROR : Incorrect number of MMAs in flight"); + + // We release buffers to producer warps(dma load) with some mmas in flight + PipelineState smem_pipe_release = smem_pipe_consumer_state; + + // Prologue GMMAs + int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); + + tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + + warpgroup_fence_operand(accum); + CUTLASS_PRAGMA_UNROLL + for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue) { + // WAIT on smem_pipe_consumer_state until its data are available (phase bit flips from rdPhaseBit value) + pipeline.consumer_wait(smem_pipe_consumer_state); + + int read_stage = smem_pipe_consumer_state.index(); + warpgroup_arrive(); + // Unroll the K mode manually to set scale D to 1 + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { + // (V,M,K) x (V,N,K) => (V,M,N) + cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accum); + tiled_mma.accumulate_ = GMMA::ScaleOut::One; + } + + warpgroup_commit_batch(); + + ++smem_pipe_consumer_state; + } + + 
warpgroup_fence_operand(accum); + // Mainloop GMMAs + k_tile_count -= prologue_mma_count; + + CUTLASS_PRAGMA_NO_UNROLL + for ( ; k_tile_count > 0; --k_tile_count) { + // WAIT on smem_pipe_consumer_state until its data are available (phase bit flips from rdPhaseBit value) + pipeline.consumer_wait(smem_pipe_consumer_state); + + // + // Compute on k_tile + // + + int read_stage = smem_pipe_consumer_state.index(); + warpgroup_fence_operand(accum); + warpgroup_arrive(); + // Unroll the K mode manually to set scale D to 1 + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { + // (V,M) x (V,N) => (V,M,N) + cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accum); + tiled_mma.accumulate_ = GMMA::ScaleOut::One; + } + warpgroup_commit_batch(); + + /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_producer_state is consumed + warpgroup_wait(); + warpgroup_fence_operand(accum); + + // UNLOCK smem_pipe_release, done _computing_ on it + pipeline.consumer_release(smem_pipe_release); + + // Advance smem_pipe_consumer_state and smem_pipe_release + ++smem_pipe_consumer_state; + ++smem_pipe_release; + } + + warpgroup_fence_operand(accum); + } + + /// Perform a Consumer Epilogue to release all buffers + CUTLASS_DEVICE void + mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) { + // Prologue GMMAs + int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); + k_tile_count -= prologue_mma_count; + + smem_pipe_release.advance(k_tile_count); + + // Wait on all GMMAs to complete + warpgroup_wait<0>(); + + for (int count = 0; count < prologue_mma_count; ++count) { + pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it + ++smem_pipe_release; + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::conv::collective + 
+///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/conv2d_problem_size.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/conv2d_problem_size.h new file mode 100644 index 0000000000000000000000000000000000000000..fbef858a54eda2ffbfea30e8ff9bd570bcf841f9 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/conv2d_problem_size.h @@ -0,0 +1,658 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief This file contains definitions and utility functions for describing convolution problem sizes. + + Conv2dProblem desciption: + activation (NHWC), + filter (KRSC), + output (NPQK), + pading (pad_h, pad_w), + stride (stride_h, stride_w), + dilation (dilation_h, dilation_w). + + Free functions to map: + Map tensor extents (Conv2d -> ImplicitGemm) : implicit_gemm_tensor_[a|b|c]_extent(ConvolutionOperator) + Map tensor sizes (Conv2d -> ImplicitGemm) : implicit_gemm_tensor_[a|b|c]_size(ConvolutionOperator) + Map tensor problem sizes (Conv2d -> ImplicitGemm): implicit_gemm_problem_size(ConvolutionOperator) +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/tensor_coord.h" +#include "cutlass/fast_math.h" +#include "cutlass/gemm/gemm_enumerated_types.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/functional.h" + +namespace cutlass { +namespace conv { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Problem size structure +struct Conv2dProblemSize { + + // Conv2d strictly problem size parameters + int N, H, W, C, P, Q, K, R, S; + int pad_h, pad_w; + int stride_h, stride_w; + int dilation_h, dilation_w; + Mode mode; + + // Conv2d implementation-related parameters + int split_k_slices; + int 
groups; + + // + // Methods + // + +public: + CUTLASS_HOST_DEVICE + Conv2dProblemSize(): + N(0), H(0), W(0), C(0), P(0), Q(0), K(0), R(0), S(0), + pad_h(0), pad_w(0), stride_h(1), stride_w(1), dilation_h(1), dilation_w(1), + mode(Mode::kConvolution), split_k_slices(1), groups(1) { } + + /// Constructor for default padding, stride, dilation, and split-K + CUTLASS_HOST_DEVICE + Conv2dProblemSize( + int N, + int H, + int W, + int C, + int P, + int Q, + int K, + int R, + int S, + Mode mode + ): + N(N), H(H), W(W), C(C), P(P), Q(Q), K(K), R(R), S(S), + pad_h(R / 2), pad_w(S / 2), stride_h(1), stride_w(1), dilation_h(1), dilation_w(1), + mode(mode), split_k_slices(1), groups (1) { } + + /// Constructor + CUTLASS_HOST_DEVICE + Conv2dProblemSize( + int N, + int H, + int W, + int C, + int K, + int R, + int S, + int P, + int Q, + int pad_h, + int pad_w, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + Mode mode, + int split_k_slices = 1, + int groups = 1 + ): + N(N), H(H), W(W), C(C), P(P), Q(Q), K(K), R(R), S(S), + pad_h(pad_h), pad_w(pad_w), stride_h(stride_h), stride_w(stride_w), + dilation_h(dilation_h), dilation_w(dilation_w), + mode(mode), split_k_slices(split_k_slices), groups (groups) { } + + /// Constructs convolution problem size from cutlass Tensor4DCoord and MatrixCoord + // set user-defined output size and sets P and Q (include all data members in ctor) + CUTLASS_HOST_DEVICE + Conv2dProblemSize( + cutlass::Tensor4DCoord input_size, // NHWC + cutlass::Tensor4DCoord filter_size, // KRSC + cutlass::Tensor4DCoord padding, // pad_h, _, pad_w, _ + cutlass::MatrixCoord stride, // stride_h, stride_w + cutlass::MatrixCoord dilation, // dilation_h, dilation_w + cutlass::Tensor4DCoord output_size, // NPQK + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation, + int split_k_slices = 1, + int groups = 1 + ): + N(input_size.n()), H(input_size.h()), W(input_size.w()), C(input_size.c()), + P(output_size.h()), Q(output_size.w()), + 
K(filter_size.n()), R(filter_size.h()), S(filter_size.w()), + pad_h(padding[0]), pad_w(padding[2]), + stride_h(stride.row()), stride_w(stride.column()), + dilation_h(dilation.row()), dilation_w(dilation.column()), + mode(mode), split_k_slices(split_k_slices), groups(groups) {} + + /// Constructs convolution problem size from cutlass Tensor4DCoord and MatrixCoord + // computes output size and sets P and Q (skip output from ctor arguments) + CUTLASS_HOST_DEVICE + Conv2dProblemSize( + cutlass::Tensor4DCoord input_size, // NHWC + cutlass::Tensor4DCoord filter_size, // KRSC + cutlass::Tensor4DCoord padding, // pad_h, upper_pad_h, pad_w, upper_pad_w + cutlass::MatrixCoord stride, // stride_h, stride_w + cutlass::MatrixCoord dilation, // dilation_h, dilation_w + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation, + int split_k_slices = 1, + int groups = 1 + ): + N(input_size.n()), H(input_size.h()), W(input_size.w()), C(input_size.c()), + K(filter_size.n()), R(filter_size.h()), S(filter_size.w()), + pad_h(padding[0]), pad_w(padding[2]), + stride_h(stride.row()), stride_w(stride.column()), + dilation_h(dilation.row()), dilation_w(dilation.column()), + mode(mode), split_k_slices(split_k_slices), groups(groups) { + // set output P and Q + P = ((H + pad_h + padding[1] - R * dilation_h) / stride_h) + 1; + Q = ((W + pad_w + padding[3] - S * dilation_w) / stride_w) + 1; + } + + /// Constructs convolution problem size from cutlass Tensor4DCoord and MatrixCoord + // set user-defined output size and sets P and Q (skip padding, striding, and dilation) + CUTLASS_HOST_DEVICE + Conv2dProblemSize( + cutlass::Tensor4DCoord input_size, // NHWC + cutlass::Tensor4DCoord filter_size, // KRSC + cutlass::Tensor4DCoord output_size, // NPQK + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation, + int split_k_slices = 1, + int groups = 1 + ): + N(input_size.n()), H(input_size.h()), W(input_size.w()), C(input_size.c()), + P(output_size.h()), Q(output_size.w()), + 
K(filter_size.n()), R(filter_size.h()), S(filter_size.w()), + pad_h(R / 2), pad_w(S / 2), stride_h(1), stride_w(1), + dilation_h(1), dilation_w(1), + mode(mode), split_k_slices(split_k_slices), groups(groups) {} + + // Reset covolution mode in the problem + CUTLASS_HOST_DEVICE + Conv2dProblemSize reset_mode(cutlass::conv::Mode mode_) { + Conv2dProblemSize tmp(*this); + tmp.mode = mode_; + return tmp; + } + + // Reset covolution mode in the problem + CUTLASS_HOST_DEVICE + Conv2dProblemSize reset_split_k_slices(int split_k_slices_) { + Conv2dProblemSize tmp(*this); + tmp.split_k_slices = split_k_slices_; + return tmp; + } + + /// Equality operator (ignores mode and split_k_slice) + CUTLASS_HOST_DEVICE + bool operator==(Conv2dProblemSize const &conv) const { + return ( + (N == conv.N) && (H == conv.H) && (W == conv.W) && (C == conv.C) && + (K == conv.K) && (R == conv.R) && (S == conv.S) && + (P == conv.P) && (Q == conv.Q) && + (pad_h == conv.pad_h) && (pad_w == conv.pad_w) && + (stride_h == conv.stride_h) && (stride_w == conv.stride_w) && + (dilation_h == conv.dilation_h) && (dilation_w == conv.dilation_w) + ); + } + + /// Inequality operator + CUTLASS_HOST_DEVICE + bool operator!=(Conv2dProblemSize const &rhs) const { + return !(*this == rhs); + } + + /// Returns activation extent as Tensor4DCoord + CUTLASS_HOST_DEVICE + cutlass::Tensor4DCoord activation_extent() const { + + return cutlass::Tensor4DCoord ({N, H, W, C}); + } + + /// Returns filter extent as Tensor4DCoord + CUTLASS_HOST_DEVICE + cutlass::Tensor4DCoord filter_extent(bool is_deconv = false) const { + + return is_deconv ? 
cutlass::Tensor4DCoord ({C, R, S, K / groups}) + : cutlass::Tensor4DCoord ({K, R, S, C / groups}); + } + + /// Returns output extent as Tensor4DCoord + CUTLASS_HOST_DEVICE + cutlass::Tensor4DCoord output_extent() const { + + return cutlass::Tensor4DCoord ({N, P, Q, K}); + } + + /// Returns activation size in number of elements + CUTLASS_HOST_DEVICE + int64_t activation_size() const { + + return static_cast(N) * static_cast(H) * + static_cast(W) * static_cast(C); + } + + /// Returns filter size in number of elements + CUTLASS_HOST_DEVICE + int64_t filter_size() const { + + return static_cast(K) * static_cast(R) * + static_cast(S) * static_cast(C) / + static_cast(groups); + } + + /// Returns output size in number of elements + CUTLASS_HOST_DEVICE + int64_t output_size() const { + + return static_cast(N) * static_cast(P) * + static_cast(Q) * static_cast(K); + } + + /// Returns padding as Tensor4DCoord + CUTLASS_HOST_DEVICE + cutlass::Tensor4DCoord padding() const { + + return cutlass::Tensor4DCoord ({pad_h, pad_h, pad_w, pad_w}); + } + + /// Returns stride as MatrixCoord + CUTLASS_HOST_DEVICE + cutlass::MatrixCoord stride() const { + + return cutlass::MatrixCoord ({stride_h, stride_w}); + } + + /// Returns dilation as MatrixCoord + CUTLASS_HOST_DEVICE + cutlass::MatrixCoord dilation() const { + + return cutlass::MatrixCoord ({dilation_h, dilation_w}); + } + + ///////////////////////////////////////////////////////////////// + // Methods used for strided dgrad implementation + ///////////////////////////////////////////////////////////////// + /// Number of filter r positions to accumulate in gemm-k dim + CUTLASS_HOST_DEVICE + int num_gemm_k_filter_r(int r) const { + return ((R - r + stride_h - 1) / stride_h); + } + + /// Number of filter s positions to accumulate in gemm-k dim + CUTLASS_HOST_DEVICE + int num_gemm_k_filter_s(int s) const { + return ((S - s + stride_w - 1) / stride_w); + } + + /// Number of filter positions to accumulate in gemm-k dim + 
CUTLASS_HOST_DEVICE + int num_gemm_k_filter_positions(int r, int s) const { + return num_gemm_k_filter_r(r) * num_gemm_k_filter_s(s); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// ImplicitGemm helper functions // +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Determine the problem size of the implicit GEMM operation +CUTLASS_HOST_DEVICE +cutlass::gemm::GemmCoord implicit_gemm_problem_size( + Operator conv_operator, + Conv2dProblemSize const &problem_size) { + // Compute problem size + switch (conv_operator) { + case Operator::kFprop: + return gemm::GemmCoord( + problem_size.N * problem_size.P * problem_size.Q, + problem_size.K, + problem_size.R * problem_size.S * problem_size.C / problem_size.groups + ); + case Operator::kDeconv: + case Operator::kDgrad: + return gemm::GemmCoord( + problem_size.N * problem_size.H * problem_size.W, + problem_size.C, + problem_size.R * problem_size.S * problem_size.K + ); + case Operator::kWgrad: + return gemm::GemmCoord( + problem_size.K, + problem_size.R * problem_size.S * problem_size.C, + problem_size.N * problem_size.P * problem_size.Q + ); + default: + break; + } + return gemm::GemmCoord(); +} + +// Determine the number of gemm_k iterations for conv2d problem using implicit gemm algorithm +CUTLASS_HOST_DEVICE +int implicit_gemm_k_iterations( + Operator conv_operator, + int threadblock_K, + Conv2dProblemSize const &problem_size, + IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic, + GroupMode group_mode = GroupMode::kNone, + int threadblock_N = 0) { + + int iterations = 0; + + if (group_mode == GroupMode::kNone) { + + if (algorithm == IteratorAlgorithm::kFixedChannels) { + + int positions_per_iteration = threadblock_K / problem_size.C; + switch (conv_operator) { + case Operator::kFprop: + iterations = (problem_size.R * problem_size.S + positions_per_iteration - 1 ) / 
positions_per_iteration; + break; + + default: + break; + } + } + else if (algorithm == IteratorAlgorithm::kFewChannels) { + + switch (conv_operator) { + case Operator::kFprop: + iterations = (problem_size.R * problem_size.S * problem_size.C + threadblock_K - 1 ) / threadblock_K; + break; + + default: + break; + } + } + else { + int elements_per_split_k_slice = 0; + + switch (conv_operator) { + case Operator::kFprop: + elements_per_split_k_slice = (problem_size.C + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K); + break; + + case Operator::kDeconv: + case Operator::kDgrad: + elements_per_split_k_slice = (problem_size.K + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K); + break; + + case Operator::kWgrad: + elements_per_split_k_slice = (problem_size.N * problem_size.P * problem_size.Q + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = (elements_per_split_k_slice + threadblock_K - 1) / threadblock_K; + break; + + default: + break; + } + } + + } else if (group_mode == GroupMode::kDepthwise) { + int channels_per_cta = threadblock_N; + + if (algorithm == IteratorAlgorithm::kAnalytic) { + switch (conv_operator) { + case Operator::kFprop: + iterations = problem_size.R * problem_size.S * + ((channels_per_cta + threadblock_K - 1) / threadblock_K); + break; + + default: + break; + } + } + } else { // Group conv + + int channels_per_group = problem_size.C / problem_size.groups; + int k_per_group = problem_size.K / problem_size.groups; + + if (algorithm == IteratorAlgorithm::kAnalytic) { + switch (conv_operator) { + case Operator::kFprop: + iterations = problem_size.R * problem_size.S * ((channels_per_group + threadblock_K - 1) / threadblock_K); + // In group conv, if k_per_group < 
threadblock_N, one Threadblock will calculate multiple groups + if (problem_size.groups != 1) { + if (k_per_group < threadblock_N) { + iterations *= threadblock_N / k_per_group; + } + } + break; + + default: + break; + } + } else if (algorithm == IteratorAlgorithm::kOptimized) { + // Current optimized iterator only support GroupMode::kSingleGroup + if (group_mode == GroupMode::kSingleGroup) { + switch (conv_operator) { + case Operator::kFprop: + iterations = problem_size.R * problem_size.S * ((channels_per_group + threadblock_K - 1) / threadblock_K); + break; + + default: + break; + } + } + } + + } + + return iterations; +} + + +template +CUTLASS_HOST_DEVICE +int depthwise_gemm_k_iterations( + Operator conv_operator, + int threadblock_K, + Conv2dProblemSize const &problem_size, + IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic, + GroupMode group_mode = GroupMode::kNone, + int threadblock_N = 0) { + + int n = problem_size.N; + int p = (problem_size.P + Output_P - 1) / Output_P; + int q = (problem_size.Q + Output_Q - 1) / Output_Q; + + int iterations = (n * p * q + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + return iterations; +} + + +CUTLASS_HOST_DEVICE +int implicit_gemm_k_iterations_per_channel( + Operator conv_operator, + Conv2dProblemSize const &problem_size, + IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic) { + + int iterations = 0; //0 means not applicable + if (algorithm == IteratorAlgorithm::kAnalytic || algorithm == IteratorAlgorithm::kOptimized) { + switch (conv_operator) { + case Operator::kFprop: + iterations = problem_size.R * problem_size.S; + break; + + case Operator::kDeconv: + case Operator::kDgrad: + iterations = problem_size.R * problem_size.S; + break; + + default: + break; + } + } + return iterations; +} + +//////////////////////////////////////////////////////////////////////////////// +// Mapping function (ImplicitGemm A, B, C -> Conv Activation, Filter, Output) 
+//////////////////////////////////////////////////////////////////////////////// +/// Returns ImplicitGemm tensor A extent as Tensor4DCoord +CUTLASS_HOST_DEVICE +cutlass::Tensor4DCoord implicit_gemm_tensor_a_extent( + Operator conv_operator, + Conv2dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.activation_extent(); + case cutlass::conv::Operator::kDeconv: + case cutlass::conv::Operator::kDgrad: return problem_size.output_extent(); + case cutlass::conv::Operator::kWgrad: return problem_size.output_extent(); + default : break; + } + return cutlass::Tensor4DCoord(); +} + +/// Returns ImplicitGemm tensor B extent as Tensor4DCoord +CUTLASS_HOST_DEVICE +cutlass::Tensor4DCoord implicit_gemm_tensor_b_extent( + Operator conv_operator, + Conv2dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.filter_extent(); + case cutlass::conv::Operator::kDeconv: return problem_size.filter_extent(true); + case cutlass::conv::Operator::kDgrad: return problem_size.filter_extent(); + case cutlass::conv::Operator::kWgrad: return problem_size.activation_extent(); + default : break; + } + return cutlass::Tensor4DCoord(); +} + +/// Returns ImplicitGemm tensor C extent as Tensor4DCoord +CUTLASS_HOST_DEVICE +cutlass::Tensor4DCoord implicit_gemm_tensor_c_extent( + Operator conv_operator, + Conv2dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.output_extent(); + case cutlass::conv::Operator::kDeconv: + case cutlass::conv::Operator::kDgrad: return problem_size.activation_extent(); + case cutlass::conv::Operator::kWgrad: return problem_size.filter_extent(); + default : break; + } + return cutlass::Tensor4DCoord(); +} + +/// Returns ImplicitGemm tensor A size in number of elements +CUTLASS_HOST_DEVICE +int64_t implicit_gemm_tensor_a_size( + Operator conv_operator, + Conv2dProblemSize 
const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.activation_size(); + case cutlass::conv::Operator::kDeconv: + case cutlass::conv::Operator::kDgrad: return problem_size.output_size(); + case cutlass::conv::Operator::kWgrad: return problem_size.output_size(); + default : break; + } + return 0; +} + +/// Returns ImplicitGemm tensor B size in number of elements +CUTLASS_HOST_DEVICE +int64_t implicit_gemm_tensor_b_size( + Operator conv_operator, + Conv2dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.filter_size(); + case cutlass::conv::Operator::kDeconv: + case cutlass::conv::Operator::kDgrad: return problem_size.filter_size(); + case cutlass::conv::Operator::kWgrad: return problem_size.activation_size(); + default : break; + } + return 0; +} + +/// Returns ImplicitGemm tensor C size in number of elements +CUTLASS_HOST_DEVICE +int64_t implicit_gemm_tensor_c_size( + Operator conv_operator, + Conv2dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.output_size(); + case cutlass::conv::Operator::kDeconv: + case cutlass::conv::Operator::kDgrad: return problem_size.activation_size(); + case cutlass::conv::Operator::kWgrad: return problem_size.filter_size(); + default : break; + } + return 0; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Strided dgrad helper functions // +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Returns number of CTAs tile M to cover valid MMAs per starting filter postion +CUTLASS_HOST_DEVICE +int strided_dgrad_tile_m_per_filter( + Conv2dProblemSize const &problem_size, + int tile_size_m) { + + // Compute NHW rows in Dx output 
that needs MMA per starting filter position + int rows_h_per_filter = (problem_size.H + problem_size.stride_h - 1) / problem_size.stride_h; + int rows_w_per_filter = (problem_size.W + problem_size.stride_w - 1) / problem_size.stride_w; + int rows_nhw_per_filter = problem_size.N * rows_h_per_filter * rows_w_per_filter; + + // Number of CTAs tile M to cover valid MMAs per starting filter postion + int tile_m_per_filter = (rows_nhw_per_filter + tile_size_m - 1) / tile_size_m; + + return tile_m_per_filter; +} + +// Computes starting Dx coord (h, w) for given starting filter postion +CUTLASS_HOST_DEVICE +void strided_dgrad_starting_coords( + Conv2dProblemSize const &problem_size, + FastDivmod const &stride_h_divmod, FastDivmod const &stride_w_divmod, + int r, int s, + int &start_h, int &start_w) { + + // function locals for remainder by fast divmod + int pad_h_rem_, pad_w_rem_; + + // start_h = std::abs(problem_size.stride_h - ((problem_size.pad_h % problem_size.stride_h) - r)) % problem_size.stride_h; + stride_h_divmod.divmod(pad_h_rem_, problem_size.pad_h); + int r_ = absolute_value(problem_size.stride_h - (pad_h_rem_ - r)); + stride_h_divmod.divmod(start_h, r_); + + //start_w = std::abs(problem_size.stride_w - ((problem_size.pad_w % problem_size.stride_w) - s)) % problem_size.stride_w; + stride_w_divmod.divmod(pad_w_rem_, problem_size.pad_w); + int s_ = absolute_value(problem_size.stride_w - (pad_w_rem_ - s)); + stride_w_divmod.divmod(start_w, s_); +} + +} // namespace conv +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/conv3d_problem_size.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/conv3d_problem_size.h new file mode 100644 index 0000000000000000000000000000000000000000..48bf056e17014400a6bc41b87193a05de3cb9c96 --- /dev/null +++ 
b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/conv3d_problem_size.h @@ -0,0 +1,519 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief This file contains definitions and utility functions for describing convolution problem sizes. + + Conv3dProblem desciption: + activation (NDHWC), + filter (KTRSC), + output (NZPQK), + pading (pad_d, pad_h, pad_w), + stride (stride_d, stride_h, stride_w), + dilation (dilation_d, dilation_h, dilation_w). + + Free functions to map: + Map tensor extents (Conv3d -> ImplicitGemm) : implicit_gemm_tensor_[a|b|c]_extent(ConvolutionOperator) + Map tensor sizes (Conv3d -> ImplicitGemm) : implicit_gemm_tensor_[a|b|c]_size(ConvolutionOperator) + Map tensor problem sizes (Conv3d -> ImplicitGemm): implicit_gemm_problem_size(ConvolutionOperator) +*/ + +#pragma once + +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +namespace cutlass { +namespace conv { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Problem size structure +struct Conv3dProblemSize : public Conv2dProblemSize { + // + // Type definitions + // + + // 3D coordinate for padding, stride, and dilation in (d, h, w) dimensions + using Coord3D = Coord<3>; + + // + // Data members + // + + // Conv3d strictly problem size parameters + int D, T, Z; // input depth, filter depth, output depth + int pad_d; // padding in depth dimension + int stride_d; // stride in depth dimension + int dilation_d; // dilation in depth dimension + + // + // Methods + // +public: + CUTLASS_HOST_DEVICE + Conv3dProblemSize(): + Conv2dProblemSize(), + D(0), T(0), Z(0), + pad_d(0), + stride_d(1), + dilation_d(1) { } + + /// Constructor for default padding, stride, dilation, and split-K + CUTLASS_HOST_DEVICE + Conv3dProblemSize( + int N, + int D, + int H, + int W, + int C, + int Z, + int P, + int Q, + int K, + int T, + int R, + int S, + Mode mode + ): + Conv2dProblemSize(N, H, W, C, P, Q, K, R, S, mode), + D(D), T(T), Z(Z), + pad_d(T / 2), stride_d(1), dilation_d(1) { } + + /// Constructor + CUTLASS_HOST_DEVICE + Conv3dProblemSize( + 
int N, + int D, + int H, + int W, + int C, + int K, + int T, + int R, + int S, + int Z, + int P, + int Q, + int pad_d, + int pad_h, + int pad_w, + int stride_d, + int stride_h, + int stride_w, + int dilation_d, + int dilation_h, + int dilation_w, + Mode mode, + int split_k_slices = 1, + int groups = 1 + ): + Conv2dProblemSize( + N, H, W, C, K, R, S, P, Q, + pad_h, pad_w, + stride_h, stride_w, + dilation_h, dilation_w, + mode, split_k_slices, groups), + D(D), T(T), Z(Z), + pad_d(pad_d), stride_d(stride_d), dilation_d(dilation_d) { } + + /// Constructs convolution problem size from cutlass Tensor5DCoord and Coord3D + // set *user-defined* output size and sets Z, P, and Q (include all data members in ctor) + CUTLASS_HOST_DEVICE + Conv3dProblemSize( + cutlass::Tensor5DCoord input_size, // NDHWC + cutlass::Tensor5DCoord filter_size, // KTRSC + Coord3D padding, // pad_d, pad_h, pad_w + Coord3D stride, // stride_d, stride_h, stride_w + Coord3D dilation, // dilation_d, dilation_h, dilation_w + cutlass::Tensor5DCoord output_size, // NZPQK + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation, + int split_k_slices = 1, + int groups = 1 + ): + Conv2dProblemSize( + {input_size.n(), input_size.h(), input_size.w(), input_size.c()}, + {filter_size.n(), filter_size.h(), filter_size.w(), filter_size.c()}, + {padding[1], padding[1], padding[2], padding[2]}, + {stride[1], stride[2]}, + {dilation[1], dilation[2]}, + {output_size.n(), output_size.h(), output_size.w(), output_size.c()}, + mode, split_k_slices, groups), + D(input_size.d()), T(filter_size.d()), Z(output_size.d()), + pad_d(padding[0]), stride_d(stride[0]), dilation_d(dilation[0]) { } + + /// Constructs convolution problem size from cutlass Tensor5DCoord and Coord3D + // *computes* output size and sets Z, P and Q (include all data members in ctor) + CUTLASS_HOST_DEVICE + Conv3dProblemSize( + cutlass::Tensor5DCoord input_size, // NDHWC + cutlass::Tensor5DCoord filter_size, // KTRSC + Coord3D padding, // pad_d, 
pad_h, pad_w + Coord3D stride, // stride_d, stride_h, stride_w + Coord3D dilation, // dilation_d, dilation_h, dilation_w + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation, + int split_k_slices = 1, + int groups = 1 + ): + Conv2dProblemSize( + {input_size.n(), input_size.h(), input_size.w(), input_size.c()}, + {filter_size.n(), filter_size.h(), filter_size.w(), filter_size.c()}, + {padding[1], padding[1], padding[2], padding[2]}, + {stride[1], stride[2]}, + {dilation[1], dilation[2]}, + mode, split_k_slices, groups), + D(input_size.d()), T(filter_size.d()), + pad_d(padding[0]), stride_d(stride[0]), dilation_d(dilation[0]) + { + // set output Z + Z = ((D + pad_d * 2 - T * dilation_d) / stride_d) + 1; + } + + /// Constructs convolution problem size from cutlass Tensor5DCoord, Coord3D + // *computes* output size and sets Z, P and Q (include all data members in ctor) + CUTLASS_HOST_DEVICE + Conv3dProblemSize( + cutlass::Tensor5DCoord input_size, // NDHWC + cutlass::Tensor5DCoord filter_size, // KTRSC + CUTLASS_STL_NAMESPACE::tuple padding, // Coord3D {pad_d, pad_h, pad_w} & Coord3D {far pad_d, pad_h, pad_w} to calculate o/p/q + Coord3D stride, // stride_d, stride_h, stride_w + Coord3D dilation, // dilation_d, dilation_h, dilation_w + cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation, + int split_k_slices = 1, + int groups = 1 + ): + Conv2dProblemSize( + {input_size.n(), input_size.h(), input_size.w(), input_size.c()}, + {filter_size.n(), filter_size.h(), filter_size.w(), filter_size.c()}, + {CUTLASS_STL_NAMESPACE::get<0>(padding)[1], CUTLASS_STL_NAMESPACE::get<1>(padding)[1], + CUTLASS_STL_NAMESPACE::get<0>(padding)[2], CUTLASS_STL_NAMESPACE::get<1>(padding)[2]}, + {stride[1], stride[2]}, + {dilation[1], dilation[2]}, + mode, split_k_slices, groups), + D(input_size.d()), T(filter_size.d()), + pad_d(CUTLASS_STL_NAMESPACE::get<0>(padding)[0]), stride_d(stride[0]), dilation_d(dilation[0]) + { + // set output Z + Z = ((D + pad_d + 
CUTLASS_STL_NAMESPACE::get<1>(padding)[0] - T * dilation_d) / stride_d) + 1; + } + + /// Equality operator (ignores mode and split_k_slice) + CUTLASS_HOST_DEVICE + bool operator==(Conv3dProblemSize const &conv) const { + return ( + (N == conv.N) && (D == conv.D) && (H == conv.H) && (W == conv.W) && (C == conv.C) && + (K == conv.K) && (T == conv.T) && (R == conv.R) && (S == conv.S) && + (Z == conv.Z) &&(P == conv.P) && (Q == conv.Q) && + (pad_d == conv.pad_d) && (pad_h == conv.pad_h) && (pad_w == conv.pad_w) && + (stride_d == conv.stride_d) && (stride_h == conv.stride_h) && (stride_w == conv.stride_w) && + (dilation_d == conv.dilation_d) && (dilation_h == conv.dilation_h) && (dilation_w == conv.dilation_w) + ); + } + + /// Inequality operator + CUTLASS_HOST_DEVICE + bool operator!=(Conv3dProblemSize const &rhs) const { + return !(*this == rhs); + } + + // Reset covolution mode in the problem + CUTLASS_HOST_DEVICE + Conv3dProblemSize reset_mode(cutlass::conv::Mode mode_) { + Conv3dProblemSize tmp(*this); + tmp.mode = mode_; + return tmp; + } + + // Reset covolution mode in the problem + CUTLASS_HOST_DEVICE + Conv3dProblemSize reset_split_k_slices(int split_k_slices_) { + Conv3dProblemSize tmp(*this); + tmp.split_k_slices = split_k_slices_; + return tmp; + } + + /// Returns activation extent as Tensor5DCoord + CUTLASS_HOST_DEVICE + cutlass::Tensor5DCoord activation_extent() const { + + return cutlass::Tensor5DCoord ({N, D, H, W, C}); + } + + /// Returns filter extent as Tensor5DCoord + CUTLASS_HOST_DEVICE + cutlass::Tensor5DCoord filter_extent(bool is_deconv = false) const { + + return is_deconv ? 
cutlass::Tensor5DCoord ({C, T, R, S, K}) + : cutlass::Tensor5DCoord ({K, T, R, S, C}); + } + + /// Returns output extent as Tensor5DCoord + CUTLASS_HOST_DEVICE + cutlass::Tensor5DCoord output_extent() const { + + return cutlass::Tensor5DCoord ({N, Z, P, Q, K}); + } + + /// Returns activation size in number of elements + CUTLASS_HOST_DEVICE + int64_t activation_size() const { + + return static_cast(N) * static_cast(D) * + static_cast(H) * static_cast(W) * + static_cast(C); + } + + /// Returns filter size in number of elements + CUTLASS_HOST_DEVICE + int64_t filter_size() const { + + return static_cast(K) * static_cast(T) * + static_cast(R) * static_cast(S) * + static_cast(C); + } + + /// Returns output size in number of elements + CUTLASS_HOST_DEVICE + int64_t output_size() const { + + return static_cast(N) * static_cast(Z) * + static_cast(P) * static_cast(Q) * + static_cast(K); + } + + /// Returns padding as Coord3D + CUTLASS_HOST_DEVICE + Coord3D padding() const { + + return Coord3D ({pad_d, pad_h, pad_w}); + } + + /// Returns stride as MatrixCoord + CUTLASS_HOST_DEVICE + Coord3D stride() const { + + return Coord3D ({stride_d, stride_h, stride_w}); + } + + /// Returns dilation as MatrixCoord + CUTLASS_HOST_DEVICE + Coord3D dilation() const { + + return Coord3D ({dilation_d, dilation_h, dilation_w}); + } + +}; + + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// ImplicitGemm helper functions // +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Determine the problem size of the implicit GEMM operation +CUTLASS_HOST_DEVICE +cutlass::gemm::GemmCoord implicit_gemm_problem_size( + Operator conv_operator, + Conv3dProblemSize const &problem_size) { + // Compute problem size + switch (conv_operator) { + case Operator::kFprop: + return gemm::GemmCoord( + problem_size.N * problem_size.Z * problem_size.P * problem_size.Q, + problem_size.K, + problem_size.T * 
problem_size.R * problem_size.S * problem_size.C + ); + case Operator::kDeconv: + case Operator::kDgrad: + return gemm::GemmCoord( + problem_size.N * problem_size.D * problem_size.H * problem_size.W, + problem_size.C, + problem_size.T * problem_size.R * problem_size.S * problem_size.K + ); + case Operator::kWgrad: + return gemm::GemmCoord( + problem_size.K, + problem_size.T * problem_size.R * problem_size.S * problem_size.C, + problem_size.N * problem_size.Z * problem_size.P * problem_size.Q + ); + default: + break; + } + return gemm::GemmCoord(); +} + +// Determine the number of gemm_k iterations for conv2d problem using implicit gemm algorithm +CUTLASS_HOST_DEVICE +int implicit_gemm_k_iterations( + Operator conv_operator, + int threadblock_K, + Conv3dProblemSize const &problem_size, + IteratorAlgorithm algorithm = IteratorAlgorithm::kAnalytic, + GroupMode group_mode = GroupMode::kNone, + int threadblock_N = 0) { + + int iterations = 0; + int elements_per_split_k_slice = 0; + if (group_mode == GroupMode::kNone) { + switch (conv_operator) { + case Operator::kFprop: + elements_per_split_k_slice = (problem_size.C + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = problem_size.T * problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K); + break; + + case Operator::kDeconv: + case Operator::kDgrad: + elements_per_split_k_slice = (problem_size.K + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = problem_size.T * problem_size.R * problem_size.S * ((elements_per_split_k_slice + threadblock_K - 1) / threadblock_K); + break; + + case Operator::kWgrad: + elements_per_split_k_slice = (problem_size.N * problem_size.Z * problem_size.P * problem_size.Q + problem_size.split_k_slices - 1) / problem_size.split_k_slices; + iterations = (elements_per_split_k_slice + threadblock_K - 1) / threadblock_K; + break; + + default: + break; + } + } else if (group_mode == 
GroupMode::kDepthwise) { + int channels_per_cta = threadblock_N; + + if (algorithm == IteratorAlgorithm::kAnalytic) { + switch (conv_operator) { + case Operator::kFprop: + iterations = problem_size.T * problem_size.R * problem_size.S * + ((channels_per_cta + threadblock_K - 1) / threadblock_K); + break; + + default: + break; + } + } + } + + return iterations; +} + +//////////////////////////////////////////////////////////////////////////////// +// Mapping function (ImplicitGemm A, B, C -> Conv Activation, Filter, Output) +//////////////////////////////////////////////////////////////////////////////// +/// Returns ImplicitGemm tensor A extent as Tensor5DCoord +CUTLASS_HOST_DEVICE +cutlass::Tensor5DCoord implicit_gemm_tensor_a_extent( + Operator conv_operator, + Conv3dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.activation_extent(); + case cutlass::conv::Operator::kDeconv: + case cutlass::conv::Operator::kDgrad: return problem_size.output_extent(); + case cutlass::conv::Operator::kWgrad: return problem_size.output_extent(); + default : break; + } + return cutlass::Tensor5DCoord(); +} + +/// Returns ImplicitGemm tensor B extent as Tensor5DCoord +CUTLASS_HOST_DEVICE +cutlass::Tensor5DCoord implicit_gemm_tensor_b_extent( + Operator conv_operator, + Conv3dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.filter_extent(); + case cutlass::conv::Operator::kDeconv: return problem_size.filter_extent(true); + case cutlass::conv::Operator::kDgrad: return problem_size.filter_extent(); + case cutlass::conv::Operator::kWgrad: return problem_size.activation_extent(); + default : break; + } + return cutlass::Tensor5DCoord(); +} + +/// Returns ImplicitGemm tensor C extent as Tensor5DCoord +CUTLASS_HOST_DEVICE +cutlass::Tensor5DCoord implicit_gemm_tensor_c_extent( + Operator conv_operator, + Conv3dProblemSize const &problem_size) { + 
switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.output_extent(); + case cutlass::conv::Operator::kDeconv: + case cutlass::conv::Operator::kDgrad: return problem_size.activation_extent(); + case cutlass::conv::Operator::kWgrad: return problem_size.filter_extent(); + default : break; + } + return cutlass::Tensor5DCoord(); +} + +/// Returns ImplicitGemm tensor A size in number of elements +CUTLASS_HOST_DEVICE +int64_t implicit_gemm_tensor_a_size( + Operator conv_operator, + Conv3dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.activation_size(); + case cutlass::conv::Operator::kDeconv: + case cutlass::conv::Operator::kDgrad: return problem_size.output_size(); + case cutlass::conv::Operator::kWgrad: return problem_size.output_size(); + default : break; + } + return 0; +} + +/// Returns ImplicitGemm tensor B size in number of elements +CUTLASS_HOST_DEVICE +int64_t implicit_gemm_tensor_b_size( + Operator conv_operator, + Conv3dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.filter_size(); + case cutlass::conv::Operator::kDeconv: + case cutlass::conv::Operator::kDgrad: return problem_size.filter_size(); + case cutlass::conv::Operator::kWgrad: return problem_size.activation_size(); + default : break; + } + return 0; +} + +/// Returns ImplicitGemm tensor C size in number of elements +CUTLASS_HOST_DEVICE +int64_t implicit_gemm_tensor_c_size( + Operator conv_operator, + Conv3dProblemSize const &problem_size) { + switch (conv_operator) { + case cutlass::conv::Operator::kFprop: return problem_size.output_size(); + case cutlass::conv::Operator::kDeconv: + case cutlass::conv::Operator::kDgrad: return problem_size.activation_size(); + case cutlass::conv::Operator::kWgrad: return problem_size.filter_size(); + default : break; + } + return 0; +} + +} // namespace conv +} // namespace cutlass + 
+//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/convnd_problem_shape.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/convnd_problem_shape.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3c31c21b2508914d10d41bb865a6da145bf3c106 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/convnd_problem_shape.hpp @@ -0,0 +1,601 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief This file contains definitions and utility functions for describing convolution problem shapes. +*/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/tensor_coord.h" +#include "cutlass/conv/convolution.h" + +#include "cute/container/array.hpp" + +#if ! defined(__CUDACC_RTC__) +#include +#endif + + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Implements the user facing argument for all CUTLASS 3.x convolutions in a rank agnostic fashion. +// All tensors are flat and by default treated as layout right (NDHWC, KTRSC, NZPQK) +// Supports asymmetric padding, traversal strides, dilations, and all conv algorithm types. 
+template < + conv::Operator ConvOp_, + int NumSpatialDimensions_ +> +struct ConvProblemShape { + // + // Alias types for members + // + + static constexpr int RankS = NumSpatialDimensions_; + static constexpr int RankT = NumSpatialDimensions_ + 2; + static constexpr conv::Operator ConvOp = ConvOp_; + static constexpr int NumSpatialDimensions = NumSpatialDimensions_; + using SpatialExtent = cute::array; + using TensorExtent = cute::array; + using TensorStride = cute::array; + using ShapePadding = SpatialExtent; + using TraversalStride = SpatialExtent; + using ShapeDilation = SpatialExtent; + using Corner = SpatialExtent; + + // + // Members + // + cutlass::conv::Mode mode{}; + TensorExtent shape_A{}; + TensorStride stride_A{}; + TensorExtent shape_B{}; + TensorStride stride_B{}; + TensorExtent shape_C{}; + TensorStride stride_C{}; + + // asymmetric padding, both upper and lower padding must be >= 0 + ShapePadding lower_padding{}; + ShapePadding upper_padding{}; + TraversalStride traversal_stride{}; + ShapeDilation dilation{}; + int groups = 1; + + // + // Methods + // + + ConvProblemShape() = default; + + // Constructor accepts user facing arguments and computes to stores the corners as its internal state + ConvProblemShape( + conv::Mode mode, // convolution/cross-correlation + TensorExtent shape_act, // [n,d,h,w,c] + TensorStride stride_act, // [n,d,h,w,c] + TensorExtent shape_flt, // [k,t,r,s,c] + TensorStride stride_flt, // [k,t,r,s,c] + ShapePadding lower_padding, // [pad_d, pad_h, pad_w] + ShapePadding upper_padding, // [pad_d, pad_h, pad_w] + TraversalStride tstride, // [stride_d, stride_h, stride_w] + ShapeDilation dilation, // [dilation_d, dilation_h, dilation_w] + int groups) + : mode(mode) + , lower_padding(lower_padding) + , upper_padding(upper_padding) + , traversal_stride(tstride) + , dilation(dilation) + , groups(groups) { + + auto [shape_xformed_act, stride_xformed_act] = calculate_xformed_act(shape_act, shape_flt); + set_shape_stride_ABC(shape_act, 
stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act); + } + + // Allow user input of xformed activation stride to support non-packed strides. + ConvProblemShape( + conv::Mode mode, // convolution/cross-correlation + TensorExtent shape_act, // [n,d,h,w,c] + TensorStride stride_act, // [n,d,h,w,c] + TensorExtent shape_flt, // [k,t,r,s,c] + TensorStride stride_flt, // [k,t,r,s,c] + TensorStride stride_xformed_act, // [n,z,p,q,k] + ShapePadding lower_padding, // [pad_d, pad_h, pad_w] + ShapePadding upper_padding, // [pad_d, pad_h, pad_w] + TraversalStride tstride, // [stride_d, stride_h, stride_w] + ShapeDilation dilation, // [dilation_d, dilation_h, dilation_w] + int groups) + : mode(mode) + , lower_padding(lower_padding) + , upper_padding(upper_padding) + , traversal_stride(tstride) + , dilation(dilation) + , groups(groups) { + + CUTLASS_ASSERT(stride_act[RankT - 1] == 1); + CUTLASS_ASSERT(stride_flt[RankT - 1] == 1); + CUTLASS_ASSERT(stride_xformed_act[RankT - 1] == 1); + + auto stride_act_packed = packed_stride_right_major(shape_act); + auto stride_flt_packed = packed_stride_right_major(shape_flt); + auto [shape_xformed_act, stride_xformed_act_packed] = calculate_xformed_act(shape_act, shape_flt); + + CUTLASS_PRAGMA_UNROLL + for(int i = 0; i < RankT - 1; ++i) { + CUTLASS_ASSERT(stride_act[i] >= stride_act_packed[i]); + CUTLASS_ASSERT(stride_flt[i] >= stride_flt_packed[i]); + CUTLASS_ASSERT(stride_xformed_act[i] >= stride_xformed_act_packed[i]); + } + + set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act); + } + + // Constructor accepts user facing arguments and presume packed tensor strides in canonical (CWHDN) order. 
+ ConvProblemShape( + conv::Mode mode, + TensorExtent shape_act, + TensorExtent shape_flt, + ShapePadding lower_padding, + ShapePadding upper_padding, + TraversalStride tstride, + ShapeDilation dilation, + int groups) + : ConvProblemShape( + mode, + shape_act, + packed_stride_right_major(shape_act), + shape_flt, + packed_stride_right_major(shape_flt), + lower_padding, + upper_padding, + tstride, + dilation, + groups) { + } + +#if ! defined(__CUDACC_RTC__) + // Constructor accepts user facing arguments and computes to stores the corners as its internal state + ConvProblemShape( + conv::Mode mode, + std::initializer_list shape_act_, + std::initializer_list stride_act_, + std::initializer_list shape_flt_, + std::initializer_list stride_flt_, + std::initializer_list lower_padding_, + std::initializer_list upper_padding_, + std::initializer_list traversal_stride_, + std::initializer_list dilation_, + int groups) + : mode(mode) + , groups(groups) { + + TensorExtent shape_act{}; + TensorStride stride_act{}; + TensorExtent shape_flt{}; + TensorStride stride_flt{}; + + assert(shape_act_.size() == shape_act.size()); + assert(stride_act_.size() == stride_act.size()); + assert(shape_flt_.size() == shape_flt.size()); + assert(stride_flt_.size() == stride_flt.size()); + assert(lower_padding_.size() == lower_padding.size()); + assert(upper_padding_.size() == upper_padding.size()); + assert(traversal_stride_.size() == traversal_stride.size()); + assert(dilation_.size() == dilation.size()); + + std::copy(shape_act_.begin(), shape_act_.end(), shape_act.begin()); + std::copy(stride_act_.begin(), stride_act_.end(), stride_act.begin()); + std::copy(shape_flt_.begin(), shape_flt_.end(), shape_flt.begin()); + std::copy(stride_flt_.begin(), stride_flt_.end(), stride_flt.begin()); + std::copy(lower_padding_.begin(), lower_padding_.end(), lower_padding.begin()); + std::copy(upper_padding_.begin(), upper_padding_.end(), upper_padding.begin()); + std::copy(traversal_stride_.begin(), 
traversal_stride_.end(), traversal_stride.begin()); + std::copy(dilation_.begin(), dilation_.end(), dilation.begin()); + + auto [shape_xformed_act, stride_xformed_act] = calculate_xformed_act(shape_act, shape_flt); + set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act); + } + + // Allow user input of xformed activation stride to support non-packed strides. + ConvProblemShape( + conv::Mode mode, + std::initializer_list shape_act_, + std::initializer_list stride_act_, + std::initializer_list shape_flt_, + std::initializer_list stride_flt_, + std::initializer_list stride_xformed_act_, + std::initializer_list lower_padding_, + std::initializer_list upper_padding_, + std::initializer_list traversal_stride_, + std::initializer_list dilation_, + int groups) + : mode(mode) + , groups(groups) { + TensorExtent shape_act{}; + TensorStride stride_act{}; + TensorExtent shape_flt{}; + TensorStride stride_flt{}; + TensorStride stride_xformed_act{}; + + std::copy(shape_act_.begin(), shape_act_.end(), shape_act.begin()); + std::copy(stride_act_.begin(), stride_act_.end(), stride_act.begin()); + std::copy(shape_flt_.begin(), shape_flt_.end(), shape_flt.begin()); + std::copy(stride_flt_.begin(), stride_flt_.end(), stride_flt.begin()); + std::copy(stride_xformed_act_.begin(), stride_xformed_act_.end(), stride_xformed_act.begin()); + std::copy(lower_padding_.begin(), lower_padding_.end(), lower_padding.begin()); + std::copy(upper_padding_.begin(), upper_padding_.end(), upper_padding.begin()); + std::copy(traversal_stride_.begin(), traversal_stride_.end(), traversal_stride.begin()); + std::copy(dilation_.begin(), dilation_.end(), dilation.begin()); + + CUTLASS_ASSERT(stride_act[RankT - 1] == 1); + CUTLASS_ASSERT(stride_flt[RankT - 1] == 1); + CUTLASS_ASSERT(stride_xformed_act[RankT - 1] == 1); + + auto stride_act_packed = packed_stride_right_major(shape_act); + auto stride_flt_packed = packed_stride_right_major(shape_flt); + auto 
[shape_xformed_act, stride_xformed_act_packed] = calculate_xformed_act(shape_act, shape_flt); + + CUTLASS_PRAGMA_UNROLL + for(int i = 0; i < RankT - 1; ++i) { + CUTLASS_ASSERT(stride_act[i] >= stride_act_packed[i]); + CUTLASS_ASSERT(stride_flt[i] >= stride_flt_packed[i]); + CUTLASS_ASSERT(stride_xformed_act[i] >= stride_xformed_act_packed[i]); + } + + set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act); + } + + // Constructor accepts user facing arguments and computes to stores the corners as its internal state + ConvProblemShape( + conv::Mode mode, + std::initializer_list shape_act_, + std::initializer_list shape_flt_, + std::initializer_list lower_padding_, + std::initializer_list upper_padding_, + std::initializer_list traversal_stride_, + std::initializer_list dilation_, + int groups) + : mode(mode) + , groups(groups) { + TensorExtent shape_act{}; + TensorStride stride_act{}; + TensorExtent shape_flt{}; + TensorStride stride_flt{}; + + assert(shape_act_.size() == shape_act.size()); + assert(shape_flt_.size() == shape_flt.size()); + assert(lower_padding_.size() == lower_padding.size()); + assert(upper_padding_.size() == upper_padding.size()); + assert(traversal_stride_.size() == traversal_stride.size()); + assert(dilation_.size() == dilation.size()); + + std::copy(shape_act_.begin(), shape_act_.end(), shape_act.begin()); + std::copy(shape_flt_.begin(), shape_flt_.end(), shape_flt.begin()); + std::copy(lower_padding_.begin(), lower_padding_.end(), lower_padding.begin()); + std::copy(upper_padding_.begin(), upper_padding_.end(), upper_padding.begin()); + std::copy(traversal_stride_.begin(), traversal_stride_.end(), traversal_stride.begin()); + std::copy(dilation_.begin(), dilation_.end(), dilation.begin()); + stride_act = packed_stride_right_major(shape_act); + stride_flt = packed_stride_right_major(shape_flt); + + auto [shape_xformed_act, stride_xformed_act] = calculate_xformed_act(shape_act, shape_flt); + 
set_shape_stride_ABC(shape_act, stride_act, shape_flt, stride_flt, shape_xformed_act, stride_xformed_act); + } +#endif // not defined(__CUDACC_RTC__) + + // Set shape and stride of tensor A/B/C according to following table: + // | | Fprop | Dgrad | Wgrad | + // | ------ | ------ | ------ | ------| + // | ShapeA | NDHWC | NZPQK | NZPQK | + // | ShapeB | KTRSC | KTRSC | NDHWC | + // | ShapeC | NZPQK | NDHWC | KTRSC | + // + // Input comes from calculate_xformed_act, which does NOT depend on ConvOp. + CUTLASS_HOST_DEVICE + constexpr void + set_shape_stride_ABC( + TensorExtent shape_act, + TensorStride stride_act, + TensorExtent shape_flt, + TensorStride stride_flt, + TensorExtent shape_xformed_act, + TensorStride stride_xformed_act) { +#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1) + printf("*** set_shape_stride_ABC ***"); + printf("\n shape_act: "); + print(shape_act); + printf("\n stride_act: "); + print(stride_act); + printf("\n shape_flt: "); + print(shape_flt); + printf("\n stride_flt: "); + print(stride_flt); + printf("\n shape_xformed_act: "); + print(shape_xformed_act); + printf("\n stride_xformed_act: "); + print(stride_xformed_act); + if constexpr (ConvOp == cutlass::conv::Operator::kFprop) { + printf("\n ConvOp: Fprop"); + } + if constexpr (ConvOp == cutlass::conv::Operator::kDgrad) { + printf("\n ConvOp: Dgrad"); + } + if constexpr (ConvOp == cutlass::conv::Operator::kWgrad) { + printf("\n ConvOp: Wgrad"); + } + printf("\n"); +#endif + + if constexpr (ConvOp == cutlass::conv::Operator::kFprop) { + shape_A = shape_act; + stride_A = stride_act; + shape_B = shape_flt; + stride_B = stride_flt; + shape_C = shape_xformed_act; + stride_C = stride_xformed_act; + } + else if constexpr (ConvOp == cutlass::conv::Operator::kDgrad) { + shape_A = shape_xformed_act; + stride_A = stride_xformed_act; + shape_B = shape_flt; + stride_B = stride_flt; + shape_C = shape_act; + stride_C = stride_act; + } + else if constexpr (ConvOp == 
cutlass::conv::Operator::kWgrad) { + shape_A = shape_xformed_act; + stride_A = stride_xformed_act; + shape_B = shape_act; + stride_B = stride_act; + shape_C = shape_flt; + stride_C = stride_flt; + } +#if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1) + printf("\n shape_A: "); + print(shape_A); + printf("\n stride_A: "); + print(stride_A); + printf("\n shape_B: "); + print(shape_B); + printf("\n stride_B: "); + print(stride_B); + printf("\n shape_C: "); + print(shape_C); + printf("\n stride_C: "); + print(stride_C); +#endif + } + + // Get A extents. + // fprop: A extents array contains [N,D,H,W,C]. Turn that into ((W,H,D,N), (C)) + // dgrad: A extents array contains [N,Z,P,Q,K]. Turn that into ((Q,P,Z,N), (K)) + // wgrad: A extents array contains [N,Z,P,Q,K]. Turn that into ((K), (Q,P,Z,N)) + CUTLASS_HOST_DEVICE + constexpr auto + get_shape_A() const { + using cute::make_shape; + using cute::take; + + if constexpr (ConvOp == conv::Operator::kFprop || + ConvOp == conv::Operator::kDgrad) { + return make_shape( + cute::reverse(take<0, RankT - 1>(shape_A)), + shape_A[RankT - 1]); + } + // For wgrad kernel, we need to linearize NZPQ for tensor A + else if constexpr (ConvOp == conv::Operator::kWgrad) { + return make_shape( + shape_A[RankT - 1], + cute::product(take<0, RankT - 1>(shape_A))); + } + } + + // Get B extents. + // fprop: B extents array contains [K,T,R,S,C]. Turn that into ((K), (C,S,R,T)) + // dgrad: B extents array contains [K,T,R,S,C]. Turn that into ((C), (K,S,R,T)) + // wgrad: B extents array contains [N,D,H,W,C]. 
Turn that into ((C), (W,H,D,N)) + CUTLASS_HOST_DEVICE + constexpr auto + get_shape_B() const { + using cute::make_shape; + using cute::reverse; + using cute::take; + + if constexpr (ConvOp == conv::Operator::kFprop) { + return make_shape( + shape_B[0], + reverse(take<1, RankT>(shape_B))); + } + else if constexpr (ConvOp == conv::Operator::kWgrad) { + return make_shape( + shape_B[RankT - 1], + reverse(take<0, RankT - 1>(shape_B))); + } + else if constexpr (ConvOp == conv::Operator::kDgrad) { + // shape_B: [K,T,R,S,C], return: [(C),(K,S,R,T)] + return make_shape( + shape_B[RankT - 1], + cute::insert<0>( + reverse(take<1, RankT - 1>(shape_B)), + shape_B[0])); + } + } + + // Get C extents. + // fprop: C extents array contains [N,Z,P,Q,K]. Turn that into ((Q,P,Z,N), (K)) + // dgrad: C extents array contains [N,D,H,W,C]. Turn that into ((W,H,D,N), (C)) + // wgrad: C extents array contains [K,T,R,S,C]. Turn that into ((K), (C,S,R,T)) + CUTLASS_HOST_DEVICE + constexpr auto + get_shape_C() const { + using cute::make_shape; + using cute::reverse; + using cute::take; + + if constexpr (ConvOp == conv::Operator::kFprop || + ConvOp == conv::Operator::kDgrad) { + return make_shape( + reverse(take<0, RankT - 1>(shape_C)), + shape_C[RankT - 1]); + } + else if constexpr (ConvOp == conv::Operator::kWgrad) { + return make_shape( + shape_C[0], + reverse(take<1, RankT>(shape_C))); + } + } + + // Static method that returns the canonical strides of tensors (layouts are right major and compact) + CUTLASS_HOST_DEVICE + static constexpr TensorStride + packed_stride_right_major(TensorExtent const& extents) { + TensorStride strides{}; + strides[RankT-1] = 1; + cute::for_each(cute::make_rseq{}, [&](auto i) { + strides[i] = extents[i+1] * strides[i+1]; + }); + return strides; + } + + // Static method that returns the packed logical size of any TensorExtent + CUTLASS_HOST_DEVICE + static constexpr size_t + size(TensorExtent const& extents) { + size_t size = 1; + cute::for_each(cute::make_seq{}, 
[&](auto i) { + size *= extents[i]; + }); + return size; + } + + CUTLASS_HOST_DEVICE + constexpr size_t + size_A() const { + return shape_A[0] * stride_A[0]; + } + + CUTLASS_HOST_DEVICE + constexpr size_t + size_B() const { + return shape_B[0] * stride_B[0]; + } + + CUTLASS_HOST_DEVICE + constexpr size_t + size_C() const { + return shape_C[0] * stride_C[0]; + } + + // Equality operator + CUTLASS_HOST_DEVICE + bool operator==(ConvProblemShape const& rhs) const { + using cute::for_each; + using cute::make_seq; + + bool is_equal = true; + + // Compare all tensor extents + for_each(make_seq{}, [&](auto i) { + is_equal = is_equal + && (shape_A[i] == rhs.shape_A[i]) + && (shape_B[i] == rhs.shape_B[i]); + }); + + // Compare all spatial extents + for_each(make_seq{}, [&](auto i) { + is_equal = is_equal + && (lower_padding[i] == rhs.lower_padding[i]) + && (upper_padding[i] == rhs.upper_padding[i]) + && (traversal_stride[i] == rhs.traversal_stride[i]) + && (dilation[i] == rhs.dilation[i]); + }); + + return is_equal; + } + + /// Inequality operator + CUTLASS_HOST_DEVICE + bool operator!=(ConvProblemShape const &rhs) const { + return !(*this == rhs); + } + +private: + CUTLASS_HOST_DEVICE + constexpr auto + calculate_xformed_act(TensorExtent shape_act, TensorExtent shape_flt) { + TensorExtent shape_xformed_act{}; + // calculate n,z,p,q,k. 
+ // a helper lambda to compute a single spatial extent of the nzpqk tensor + auto nzpqk_extent = [](int act_ext, int filter_ext, int pad_total, int dilation, int tstride) { + return 1 + (act_ext + pad_total - ((filter_ext -1) * dilation + 1)) / tstride; + }; + + shape_xformed_act[0] = shape_act[0]; // Activation N extent + cute::for_each(cute::make_seq{}, [&](auto i) { + shape_xformed_act[i+1] = nzpqk_extent( + shape_act[i+1], shape_flt[i+1], upper_padding[i] + lower_padding[i], dilation[i], traversal_stride[i]); + }); + shape_xformed_act[RankT-1] = shape_flt[0]; // Filter K extent + + TensorStride stride_xformed_act = packed_stride_right_major(shape_xformed_act); + + return cute::make_tuple(shape_xformed_act, stride_xformed_act); + } +}; + +template< + conv::Operator ConvOp, + int SpatialDim +> +void print(ConvProblemShape const& problem) { + printf("ConvProblemShape with %d spatial dimensions implementing cutlass::conv::Operator::%d\n", + SpatialDim, int(ConvOp)); + printf("\tTensorA: "); + cute::print(problem.shape_A); printf(":"); + cute::print(problem.stride_A); printf("\n"); + printf("\tTensorB: "); + cute::print(problem.shape_B); printf(":"); + cute::print(problem.stride_B); printf("\n"); + printf("\tTensorC: "); + cute::print(problem.shape_C); printf(":"); + cute::print(problem.stride_C); printf("\n"); + printf("\tLower padding: "); print(problem.lower_padding); printf("\n"); + printf("\tUpper padding: "); print(problem.upper_padding); printf("\n"); + printf("\tTraversal strides: "); print(problem.traversal_stride); printf("\n"); + printf("\tDilation: "); print(problem.dilation); printf("\n"); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::conv + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/convolution.h 
b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/convolution.h new file mode 100644 index 0000000000000000000000000000000000000000..a3cc98b4740115aefd557468d01ad28fa9a1028a --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/convolution.h @@ -0,0 +1,194 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
/*! \file
    \brief

This file contains definitions and utility functions for describing convolution problem sizes in terms of
activation (NHWC), filter (KRSC), output (NPQK), padding (pad_h, pad_w), stride (stride_h, stride_w), and
dilation (dilation_h, dilation_w). Furthermore, it defines helper functions to map CUTLASS's implicit gemm
tensor extents, sizes, and data types to that of the convolution's extents, sizes, and data types.

                        * Mapping convolutions to Gemm computation *

Cutlass implements convolutions with the Implicit Gemm algorithm. This algorithm performs a gemm
(general matrix-matrix multiply) on the convolution tensors Activation, Filter, and Output.
The underlying gemm operation follows the standard gemm definition:

                                     C = A * B + C

                               A and B are input matrices
                              C is source and output matrix


For the three convolutional operators (Fprop, Dgrad, Wgrad), ImplicitGemm matrices A, B, and C are mapped
to convolution tensors Activation, Filter and Output as described in the table below.

        ___________________________________________________________________________
         ConvolutionalOperator |        A        |      B         |       C
        ___________________________________________________________________________
        |                      |                 |                |                |
        |       Fprop          |    Activation   |    Filter      |     Output     |
        |       Dgrad          |    Output       |    Filter      |     Activation |
        |       Wgrad          |    Output       |    Activation  |     Filter     |
        ___________________________________________________________________________

In convolution codebase, DO NOT mix using (A, B, C) with (Activation, Filter, Output).

For example, it's confusing and error prone to document a convolution class or function
as operating on "A, B, Output." Instead, use the mapping functions below,
and adhere to using either A, B, C or Activation, Filter, Output.

Map elements' data types (ImplicitGemm -> Conv): GemmToConvElementMap
Map elements' data types (Conv -> ImplicitGemm): ConvToGemmElementMap
*/

#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/layout/tensor.h"
#include "cutlass/tensor_coord.h"
#include "cutlass/fast_math.h"
#include "cutlass/gemm/gemm_enumerated_types.h"
#include "cutlass/matrix_coord.h"

namespace cutlass {
namespace conv {

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Convolutional operator (which of the three tensors is being computed)
enum class Operator {
  kFprop,   ///< forward propagation: computes Output from Activation and Filter
  kDgrad,   ///< data gradient: computes Activation gradient from Output and Filter
  kWgrad,   ///< weight gradient: computes Filter gradient from Output and Activation
  kDeconv   ///< transposed convolution (deconvolution)
};

/// Distinguishes convolution from cross correlation
/// (convolution flips the filter; cross-correlation does not)
enum class Mode {
  kCrossCorrelation,
  kConvolution
};

/// Selects among several implementation variants trading off performance with simplicity
enum class IteratorAlgorithm {
  kAnalytic,            ///< functionally correct in all cases but lower performance
  kOptimized,           ///< optimized for R <= 32, S <= 32 and unity-stride dgrad
  kFixedChannels,       ///< Analytic algorithm optimized for fixed channel count (C == AccessSize)
  kFewChannels,         ///< Analytic algorithm optimized for few channels (C divisible by AccessSize)
  kFixedStrideDilation  ///< Optimized for fixed stride and dilation
};

/// Distinguishes among partial specializations that accelerate certain problems where convolution
/// stride is unit.
enum class StrideSupport {
  kStrided,   ///< arbitrary convolution stride
  kUnity,     ///< unit convolution stride
  kFixed      ///< fixed convolution stride
};

/// Identifies split-K mode
enum class SplitKMode {
  kNone,
  kSerial,
  kParallel
};

/// Identifies group mode
enum class GroupMode {
  kNone,
  kSingleGroup,   ///< One CTA calculates one group or less
  kMultipleGroup, ///< One CTA calculates multiple groups
  kDepthwise      ///< One CTA calculates cta_n groups (problem_size.C == problem_size.K == problem_size.groups)
};

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Shape of a tensor (compile-time NHWC extents and derived products)
template <
  int N = 1,
  int H = 1,
  int W = 1,
  int C = 1
>
struct TensorNHWCShape {
  static int const kN = N;
  static int const kH = H;
  static int const kW = W;
  static int const kC = C;

  static int const kHW = H * W;
  static int const kNHW = N * kHW;
  static int const kNHWC = N * H * W * C;

  // total number of elements in the tensor
  static int const kCount = kNHWC;

  //
  // Static member functions
  //

  /// Returns a Coord object
  CUTLASS_HOST_DEVICE
  static Coord<4> toCoord() {
    return make_Coord(kN, kH, kW, kC);
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Shape of a conv2d stride, which controls how the filter convolves around the input volume
template <
  /// Stride in horizontal direction
  int u = 1,
  /// Stride in vertical direction
  int v = 1
>
struct Stride2D {
  static int const kU = u;
  static int const kV = v;

  //
  // Static member functions
  //

  /// Returns a Coord object
  CUTLASS_HOST_DEVICE
  static Coord<2> toCoord() {
    return make_Coord(kU, kV);
  }
};

////////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace conv
} //
namespace cutlass + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/detail.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/detail.hpp new file mode 100644 index 0000000000000000000000000000000000000000..0802921d60ce1809a7da67805de0f045c3511b19 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/detail.hpp @@ -0,0 +1,137 @@ + +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cutlass/conv/convnd_problem_shape.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv::detail { + +///////////////////////////////////////////////////////////////////////////////////////////////// + + // Helper function to get the problem shape +template +auto get_problem_shape_MNKL_helper(ProblemShape const& problem_shape, cute::true_type) { + return T::get_problem_shape_MNKL(problem_shape); +} + +template +ProblemShape get_problem_shape_MNKL_helper(ProblemShape const& problem_shape, cute::false_type) { + return problem_shape; +} + +// Get problem shape MNKL according to following table: +// | | Fprop | Dgrad | Wgrad | +// | ---- | --------- | -------- | -------- | +// | Shape_M | (Q,P,Z,N) | (W/V,H/U,D/O,N) | (K) | +// | Shape_N | (K) | (C) | (C,S,R,T) | +// | Shape_K | (C,S,R,T) | (K,S,R,T) | (Q,P,Z,N) | +// | Shape_L | _1 | (V,U,O) | _1 | + +template +CUTLASS_HOST_DEVICE +constexpr auto +get_transformed_problem_shape_MNKL(ProblemShape const& problem_shape) { + return problem_shape; +} + + +template +CUTLASS_HOST_DEVICE +constexpr auto +get_transformed_problem_shape_MNKL(ConvProblemShape const& problem_shape) { + using cute::insert; + using cute::make_shape; + using cute::reverse; + using cute::take; + + constexpr int RankT 
= SpatialDim + 2; + + if constexpr (ConvOp == conv::Operator::kWgrad) { + auto M_xformed = problem_shape.shape_C[0]; + auto N_xformed = reverse(take<1, RankT>(problem_shape.shape_C)); + auto K_xformed = reverse(take<0, RankT - 1>(problem_shape.shape_A)); + auto L_xformed = cute::Int<1>{}; + + return make_shape(M_xformed, N_xformed, K_xformed, L_xformed); + } + else if constexpr (ConvOp == conv::Operator::kFprop){ + auto M_xformed = reverse(take<0, RankT - 1>(problem_shape.shape_C)); + auto N_xformed = problem_shape.shape_C[RankT - 1]; + auto K_xformed = reverse(take<1, RankT>(problem_shape.shape_B)); + auto L_xformed = cute::Int<1>{}; + + return make_shape(M_xformed, N_xformed, K_xformed, L_xformed); + } + else if constexpr (ConvOp == conv::Operator::kDgrad) { + auto L_xformed = reverse(problem_shape.traversal_stride); // (V,U,O) + auto M_xformed = ceil_div(reverse(take<0,RankT - 1>(problem_shape.shape_C)), L_xformed); + auto N_xformed = problem_shape.shape_C[RankT - 1]; + // shape_B: [K,T,R,S,C], K_xformed: [K,S,R,T] + auto K_xformed = insert<0>( + (reverse(take<1,RankT - 1>(problem_shape.shape_B))), + problem_shape.shape_B[0]); + + return make_shape(M_xformed, N_xformed, K_xformed, L_xformed); + } +} + +// Assuming im2col linearization +// Get problem shape MNKL according to following table: +// | | Fprop | Dgrad | Wgrad | +// | ---- | --------- | -------- | -------- | +// | Shape_M | (Q*P*Z*N) | ([W/V]*[H/U]*[D/O]*N) | (K) | +// | Shape_N | (K) | (C) | (C,S,R,T) | +// | Shape_K | (C,S,R,T) | (K,S,R,T) | (Q*P*Z*N) | +// | Shape_L | _1 | (V*U*O) | _1 | +template +CUTLASS_HOST_DEVICE +constexpr auto +get_linearized_problem_shape_MNKL(ConvProblemShape const& problem_shape) { + + auto [M, N, K, L] = get_transformed_problem_shape_MNKL(problem_shape); + + if constexpr (ConvOp == conv::Operator::kFprop || ConvOp == conv::Operator::kDgrad) { + return cute::make_shape(cute::product(M), N, K, cute::product(L)); + } + else if constexpr (ConvOp == conv::Operator::kWgrad) { + 
return cute::make_shape(M, N, cute::product(K), L); + } + +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::conv::detail + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/device/conv_universal_adapter.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/device/conv_universal_adapter.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d60469f429f94f4b8152a02d9db232eea5698e56 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/device/conv_universal_adapter.hpp @@ -0,0 +1,448 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +// common +#include "cutlass/arch/mma.h" +#include "cutlass/cutlass.h" +#include "cutlass/arch/mma.h" +#include "cutlass/trace.h" +#include "cutlass/cluster_launch.hpp" +#include "cutlass/device_kernel.h" + +#include "cutlass/conv/kernel/conv_universal.hpp" +#include "cutlass/gemm/gemm.h" +#include "cutlass/detail/layout.hpp" +#include "cutlass/cuda_host_adapter.hpp" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv::device { + +//////////////////////////////////////////////////////////////////////////////// + +/*! + ConvUniversalAdapter is a stateful, reusable handle built around a kernel + of type cutlass::conv::kernel::ConvUniversal. + + It manages the lifetime of the underlying `kernel::Params` struct, and exposes APIs + to create it from the host facing arguments. For power users, static methods + are exposed that bypass the stateful methods or args->params lowering. 
+*/ +template +class ConvUniversalAdapter +{ +public: + using ConvKernel = GetUnderlyingKernel_t; + using TileShape = typename ConvKernel::TileShape; + using ElementA = typename ConvKernel::ElementA; + using ElementB = typename ConvKernel::ElementB; + using ElementC = typename ConvKernel::ElementC; + using ElementD = typename ConvKernel::ElementD; + using ElementAccumulator = typename ConvKernel::TiledMma::ValTypeC; + using DispatchPolicy = typename ConvKernel::DispatchPolicy; + using CollectiveMainloop = typename ConvKernel::CollectiveMainloop; + using CollectiveEpilogue = typename ConvKernel::CollectiveEpilogue; + + static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER; + + // Tease out meta-information about the conv algorithm + static constexpr conv::Operator kConvolutionalOperator = DispatchPolicy::ConvOp; + static constexpr int NumSpatialDimensions = CollectiveMainloop::NumSpatialDimensions; + + // If our TiledMMA's instruction thread layout size is larger than 1, we know its a tensorop! 
+ using OperatorClass = cute::conditional_t< + (cute::size(typename ConvKernel::TiledMma::AtomThrID{}) > 1), + cutlass::arch::OpClassTensorOp, cutlass::arch::OpClassSimt>; + + using ArchTag = typename ConvKernel::ArchTag; + + // Assume TiledMma's ShapeMNK is the same as 2.x's ThreadblockShape + using ThreadblockShape = cutlass::gemm::GemmShape< + cute::size<0>(TileShape{}), + cute::size<1>(TileShape{}), + cute::size<2>(TileShape{})>; + + using ClusterShape = cutlass::gemm::GemmShape< + cute::size<0>(typename ConvKernel::DispatchPolicy::ClusterShape{}), + cute::size<1>(typename ConvKernel::DispatchPolicy::ClusterShape{}), + cute::size<2>(typename ConvKernel::DispatchPolicy::ClusterShape{})>; + + // Instruction shape is easy too, since we get that directly from our TiledMma's atom shape + using InstructionShape = cutlass::gemm::GemmShape< + cute::size<0>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}), + cute::size<1>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{}), + cute::size<2>(typename CollectiveMainloop::TiledMma::AtomShape_MNK{})>; + + // Legacy: provide a correct warp count, but no reliable warp shape + static int const kThreadCount = ConvKernel::MaxThreadsPerBlock; + + // Warp shape is not a primary API type in 3.x + // But we can best approximate it by inspecting the TiledMma + // For this, we make the assumption that we always have 4 warps along M, and rest along N, none along K + // We also always round up the warp count to 4 if the tiled mma is smaller than 128 threads + static constexpr int WarpsInMma = cute::max(4, CUTE_STATIC_V(cute::size(typename ConvKernel::TiledMma{})) / 32); + static constexpr int WarpsInMmaM = 4; + static constexpr int WarpsInMmaN = cute::ceil_div(WarpsInMma, WarpsInMmaM); + using WarpCount = cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape< + CUTE_STATIC_V(cute::tile_size<0>(typename CollectiveMainloop::TiledMma{})) / WarpsInMmaM, + CUTE_STATIC_V(cute::tile_size<1>(typename 
CollectiveMainloop::TiledMma{})) / WarpsInMmaN, + CUTE_STATIC_V(cute::tile_size<2>(typename CollectiveMainloop::TiledMma{}))>; + + static int constexpr kStages = CollectiveMainloop::DispatchPolicy::Stages; + + // Inspect TiledCopy for A and B to compute the alignment size + static int constexpr kAlignmentA = cutlass::detail::get_alignment_count_from_gmem_tiled_copy< + typename CollectiveMainloop::GmemTiledCopyA, ElementA>(); + static int constexpr kAlignmentB = cutlass::detail::get_alignment_count_from_gmem_tiled_copy< + typename CollectiveMainloop::GmemTiledCopyB, ElementB>(); + static int constexpr kAlignmentC = cutlass::detail::get_alignment_count_from_gmem_tiled_copy< + typename CollectiveEpilogue::GmemTiledCopyC, ElementC>(); + static int constexpr kAlignmentD = cutlass::detail::get_alignment_count_from_gmem_tiled_copy< + typename CollectiveEpilogue::GmemTiledCopyD, ElementD>(); + + using EpilogueOutputOp = typename CollectiveEpilogue::ThreadEpilogueOp; + + /// Argument structure: User API + using Arguments = typename ConvKernel::Arguments; + /// Argument structure: Kernel API + using Params = typename ConvKernel::Params; + +private: + + /// Kernel API parameters object + Params params_; + +public: + + /// Access the Params structure + Params const& params() const { + return params_; + } + + /// Determines whether the conv can execute the given problem. 
+ static Status + can_implement(Arguments const& args) { + if (ConvKernel::can_implement(args)) { + return Status::kSuccess; + } + else { + return Status::kInvalid; + } + } + + /// Gets the workspace size + static size_t + get_workspace_size(Arguments const& args) { + size_t workspace_bytes = 0; + CUTLASS_TRACE_HOST(" workspace_bytes: " << workspace_bytes); + + workspace_bytes += ConvKernel::get_workspace_size(args); + return workspace_bytes; + } + + /// Computes the grid shape + static dim3 + get_grid_shape(Arguments const& args, void* workspace = nullptr) { + auto tmp_params = ConvKernel::to_underlying_arguments(args, workspace); + return ConvKernel::get_grid_shape(tmp_params); + } + + /// Computes the grid shape + static dim3 + get_grid_shape(Params const& params) { + return ConvKernel::get_grid_shape(params); + } + + /// Computes the maximum number of active blocks per multiprocessor + static int maximum_active_blocks(int /* smem_capacity */ = -1) { + CUTLASS_TRACE_HOST("ConvUniversal::maximum_active_blocks()"); + int max_active_blocks = -1; + int smem_size = ConvKernel::SharedStorageSize; + + // first, account for dynamic smem capacity if needed + cudaError_t result; + if (smem_size >= (48 << 10)) { + CUTLASS_TRACE_HOST(" Setting smem size to " << smem_size); + result = cudaFuncSetAttribute( + device_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + if (cudaSuccess != result) { + result = cudaGetLastError(); // to clear the error bit + CUTLASS_TRACE_HOST( + " cudaFuncSetAttribute() returned error: " + << cudaGetErrorString(result)); + return -1; + } + } + + // query occupancy after setting smem size + result = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, + device_kernel, + ConvKernel::MaxThreadsPerBlock, + smem_size); + + if (cudaSuccess != result) { + result = cudaGetLastError(); // to clear the error bit + CUTLASS_TRACE_HOST( + " cudaOccupancyMaxActiveBlocksPerMultiprocessor() returned error: " + << 
cudaGetErrorString(result)); + return -1; + } + + CUTLASS_TRACE_HOST(" max_active_blocks: " << max_active_blocks); + return max_active_blocks; + } + + /// Initializes conv state from arguments. + Status + initialize( + Arguments const& args, + void* workspace = nullptr, + cudaStream_t stream = nullptr, + CudaHostAdapter *cuda_adapter = nullptr) { + + CUTLASS_TRACE_HOST("ConvUniversal::initialize() - workspace " + << workspace << ", stream: " << (stream ? "non-null" : "null")); + + // Initialize the workspace + Status status = ConvKernel::initialize_workspace(args, workspace, stream, cuda_adapter); + if (status != Status::kSuccess) { + return status; + } + + // Initialize the Params structure + params_ = ConvKernel::to_underlying_arguments(args, workspace); + + // Don't set the function attributes - require the CudaHostAdapter to set it. + if constexpr (kEnableCudaHostAdapter) { + CUTLASS_ASSERT(cuda_adapter); + return Status::kSuccess; + } + else { + // account for dynamic smem capacity if needed + int smem_size = ConvKernel::SharedStorageSize; + if (smem_size >= (48 << 10)) { + CUTLASS_TRACE_HOST(" Setting smem size to " << smem_size); + cudaError_t result = cudaFuncSetAttribute( + device_kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + if (cudaSuccess != result) { + result = cudaGetLastError(); // to clear the error bit + CUTLASS_TRACE_HOST(" cudaFuncSetAttribute() returned error: " << cudaGetErrorString(result)); + return Status::kErrorInternal; + } + } + } + return Status::kSuccess; + } + + /// Update API is preserved in 3.0, but does not guarantee a lightweight update of params. 
+ Status + update(Arguments const& args, void* workspace = nullptr) { + CUTLASS_TRACE_HOST("ConvUniversal()::update() - workspace: " << workspace); + + size_t workspace_bytes = get_workspace_size(args); + if (workspace_bytes > 0 && nullptr == workspace) { + return Status::kErrorWorkspaceNull; + } + + params_ = ConvKernel::to_underlying_arguments(args, workspace); + return Status::kSuccess; + } + + /// Primary run() entry point API that is static allowing users to create and manage their own params. + /// Supplied params struct must be construct by calling ConvKernel::to_underling_arguments() + static Status + run(Params& params, cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) { + CUTLASS_TRACE_HOST("ConvUniversal::run()"); + dim3 const block = ConvKernel::get_block_shape(); + dim3 const grid = get_grid_shape(params); + + // configure smem size and carveout + int smem_size = ConvKernel::SharedStorageSize; + + Status launch_result; + // Use extended launch API only for mainloops that use it + if constexpr (ConvKernel::ArchTag::kMinComputeCapability >= 90) { + [[maybe_unused]] constexpr bool is_static_1x1x1 = + cute::is_static_v and + cute::size(typename ConvKernel::DispatchPolicy::ClusterShape{}) == 1; + dim3 cluster(cute::size<0>(typename ConvKernel::DispatchPolicy::ClusterShape{}), + cute::size<1>(typename ConvKernel::DispatchPolicy::ClusterShape{}), + cute::size<2>(typename ConvKernel::DispatchPolicy::ClusterShape{})); + // Dynamic cluster support + [[maybe_unused]] dim3 fallback_cluster = dim3{0,0,0}; + if constexpr (ConvKernel::ArchTag::kMinComputeCapability == 100 || + ConvKernel::ArchTag::kMinComputeCapability == 101) { + if constexpr (!cute::is_static_v) { + fallback_cluster = params.hw_info.cluster_shape_fallback; + cluster = params.hw_info.cluster_shape; + } + } + + void* kernel_params[] = {¶ms}; + if constexpr (kEnableCudaHostAdapter) { + // + // Use the cuda host adapter + // + 
CUTLASS_ASSERT(cuda_adapter); + if (cuda_adapter) { + + launch_result = cuda_adapter->launch(grid, + cluster, + fallback_cluster, + block, + smem_size, + stream, + kernel_params, + kernel_index); + } + else { + return Status::kErrorInternal; + } + } + else { + CUTLASS_ASSERT(cuda_adapter == nullptr); + void const* kernel = (void const*) device_kernel; + if constexpr (ConvKernel::ArchTag::kMinComputeCapability == 90 + || ConvKernel::ArchTag::kMinComputeCapability == 100 + ) { + if constexpr (is_static_1x1x1) { + device_kernel<<>>(params); + launch_result = Status::kSuccess; + } + else { + launch_result = ClusterLauncher::launch( + grid, cluster, block, smem_size, stream, kernel, kernel_params); + } + } + else { + if constexpr (ConvKernel::ArchTag::kMinComputeCapability == 100 || + ConvKernel::ArchTag::kMinComputeCapability == 101) { + launch_result = ClusterLauncher::launch_with_fallback_cluster( + grid, + cluster, + fallback_cluster, + block, + smem_size, + stream, + kernel, + kernel_params); + } + } + } + } + else { + launch_result = Status::kSuccess; + + if constexpr (kEnableCudaHostAdapter) { + CUTLASS_ASSERT(cuda_adapter); + if (cuda_adapter) { + void* kernel_params[] = {¶ms}; + + launch_result = cuda_adapter->launch( + grid, block, smem_size, stream, kernel_params, 0 + ); + + } + else { + return Status::kErrorInternal; + } + } + else { + CUTLASS_ASSERT(cuda_adapter == nullptr); + device_kernel<<>>(params); + } + } + + cudaError_t result = cudaGetLastError(); + if (cudaSuccess == result && Status::kSuccess == launch_result) { + return Status::kSuccess; + } + else { + CUTLASS_TRACE_HOST(" Kernel launch failed. Reason: " << result); + return Status::kErrorInternal; + } + } + + // + // Non-static launch overloads that first create and set the internal params struct of this kernel handle. + // + + /// Launches the kernel after first constructing Params internal state from supplied arguments. 
+ Status + run( + Arguments const& args, + void* workspace = nullptr, + cudaStream_t stream = nullptr, + CudaHostAdapter *cuda_adapter = nullptr, + int32_t kernel_index = 0 + ) { + Status status = initialize(args, workspace, stream, cuda_adapter); + if (Status::kSuccess == status) { + status = run(params_, stream, cuda_adapter, kernel_index); + } + return status; + } + + /// Launches the kernel after first constructing Params internal state from supplied arguments. + Status + operator()( + Arguments const& args, + void* workspace = nullptr, + cudaStream_t stream = nullptr, + CudaHostAdapter *cuda_adapter = nullptr) { + return run(args, workspace, stream, cuda_adapter); + } + + /// Overload that allows a user to re-launch the same kernel without updating internal params struct. + Status + run(cudaStream_t stream = nullptr) { + return run(params_, stream); + } + + /// Overload that allows a user to re-launch the same kernel without updating internal params struct. + Status + operator()(cudaStream_t stream = nullptr) { + return run(params_, stream); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::conv::device + +//////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/device/direct_convolution.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/device/direct_convolution.h new file mode 100644 index 0000000000000000000000000000000000000000..387574b989681ba6f9e5e6fa333dda109b7f7aa6 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/device/direct_convolution.h @@ -0,0 +1,270 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/* \file + \brief Template for device-level Depthwise Convolution +*/ + +#pragma once + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/device_kernel.h" +#include "cutlass/conv/convolution.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +class DirectConvolution { +public: + + using UnderlyingKernel = DirectConvolutionKernel_; + + using ElementA = typename UnderlyingKernel::ElementA; + using LayoutA = typename UnderlyingKernel::LayoutA; + using ElementB = typename UnderlyingKernel::ElementB; + using LayoutB = typename UnderlyingKernel::LayoutB; + using ElementC = typename UnderlyingKernel::ElementC; + using LayoutC = typename UnderlyingKernel::LayoutC; + using ElementAccumulator = typename UnderlyingKernel::ElementAccumulator; + using ElementCompute = typename UnderlyingKernel::ElementCompute; + using OperatorClass = typename UnderlyingKernel::OperatorClass; + using ArchTag = typename UnderlyingKernel::ArchTag; + using ThreadblockShape = typename UnderlyingKernel::ThreadblockShape; + using WarpShape = typename UnderlyingKernel::WarpShape; + using InstructionShape = typename UnderlyingKernel::InstructionShape; + using ThreadblockSwizzle = typename UnderlyingKernel::ThreadblockSwizzle; + using EpilogueOutputOp = typename UnderlyingKernel::EpilogueOutputOp; + static int const kStages = UnderlyingKernel::kStages; + static int const kConvDim = UnderlyingKernel::kConvDim; + using WarpMmaOperator = typename UnderlyingKernel::WarpMmaOperator; + using ArchMmaOperator = typename UnderlyingKernel::ArchMmaOperator; + using MathOperator = typename UnderlyingKernel::MathOperator; + + static cutlass::conv::Operator const 
kConvolutionalOperator = UnderlyingKernel::kConvolutionalOperator; + static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = UnderlyingKernel::kIteratorAlgorithm; + static cutlass::conv::StrideSupport const kStrideSupport = UnderlyingKernel::kStrideSupport; + static cutlass::conv::GroupMode const kGroupMode = UnderlyingKernel::kGroupMode; + + static int const kWarpCount = + (ThreadblockShape::kM / WarpShape::kM) * + (ThreadblockShape::kN / WarpShape::kN) * + (ThreadblockShape::kK / WarpShape::kK); + + /// Argument structure + using Arguments = typename UnderlyingKernel::Arguments; + + using ReorderKernel = typename UnderlyingKernel::ReorderKernel; + + private: + + /// Kernel parameters object + typename UnderlyingKernel::Params params_; + +public: + + /// Constructs Implicit GEMM + DirectConvolution() { } + + /// Determines whether the Implicit GEMM can execute the given problem. + static Status can_implement(Arguments const &args) { + + // dispatch to iterators + Status status = UnderlyingKernel::Mma::IteratorA::can_implement(args.problem_size); + if (Status::kSuccess != status) { + return status; + } + + status = UnderlyingKernel::Mma::IteratorB::can_implement(args.problem_size); + if (Status::kSuccess != status) { + return status; + } + + if (kGroupMode != conv::GroupMode::kDepthwise) { + return Status::kErrorInvalidProblem; + } + + // C and K should be multiple of groups + if (args.problem_size.K != args.problem_size.groups && + args.problem_size.C != args.problem_size.groups) { + return Status::kErrorInvalidProblem; + } + + + static int const kAlignmentC = UnderlyingKernel::Epilogue::OutputTileIterator::kElementsPerAccess; + if (kConvolutionalOperator == conv::Operator::kFprop) { + if (args.problem_size.K % kAlignmentC) + return Status::kErrorMisalignedOperand; + } else if (kConvolutionalOperator == conv::Operator::kDgrad) { + if (args.problem_size.C % kAlignmentC) + return Status::kErrorMisalignedOperand; + } else if (kConvolutionalOperator == 
conv::Operator::kWgrad) { + if (args.problem_size.C % kAlignmentC) + return Status::kErrorMisalignedOperand; + } + + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + dim3 grid = threadblock_swizzle.get_grid_shape( + threadblock_swizzle.get_tiled_shape( + kConvolutionalOperator, + args.problem_size, + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.problem_size.split_k_slices)); + + if (!(grid.y <= std::numeric_limits::max() && + grid.z <= std::numeric_limits::max())) { + + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + + /// Gets the workspace size + static size_t get_workspace_size(Arguments const &args) { + return 0; + } + + /// Initializes GEMM state from arguments. + Status initialize( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + // initialize the params structure from the arguments + params_ = typename UnderlyingKernel::Params( + args, + static_cast(workspace) + ); + + int smem_size = int(sizeof(typename UnderlyingKernel::SharedStorage)); + + if (smem_size >= (48 << 10)) { + cudaError_t result = cudaFuncSetAttribute(cutlass::Kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + } + + return Status::kSuccess; + } + + /// Initializes GEMM state from arguments. + Status update(Arguments const &args, void *workspace = nullptr) { + + // update the params structure from the arguments + params_.ptr_A = args.ref_A.data(); + params_.ptr_B = args.ref_B.data(); + params_.ptr_C = args.ref_C.data(); + params_.ptr_D = args.ref_D.data(); + params_.output_op = args.output_op; + params_.ptr_reordered_B = args.ref_reordered_B.data(); + params_.semaphore = static_cast(workspace); + + return Status::kSuccess; + } + + /// Runs the kernel using initialized state. 
+ Status run(cudaStream_t stream = nullptr) { + + // Launch reorder kernel + if (params_.ptr_reordered_B != nullptr) { + dim3 grid = ReorderKernel::get_grid_shape(params_); + dim3 block = ReorderKernel::get_block_shape(); + + cutlass::arch::synclog_setup(); + cutlass::Kernel<<>>(params_); + } + + // Launch main kernel + ThreadblockSwizzle threadblock_swizzle; + + dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape); + dim3 block(32 * kWarpCount, 1, 1); + + // Dynamic SMEM size based on input params. + int smem_size = int(params_.get_smem_size()); + + // Make sure we can use that much shared memory. + cudaError_t status = + cudaFuncSetAttribute(cutlass::Kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); + if (status != cudaSuccess) + return Status::kErrorInternal; + + cutlass::arch::synclog_setup(); + cutlass::Kernel<<>>(params_); + + cudaError_t result = cudaGetLastError(); + + return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal; + } + + /// Runs the kernel using initialized state. + Status operator()(cudaStream_t stream = nullptr) { + return run(stream); + } + + /// Runs the kernel using initialized state. 
+ Status operator()( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + Status status = initialize(args, workspace, stream); + + if (status == Status::kSuccess) { + status = run(stream); + } + + return status; + } + + int get_smem_size() { return int(params_.get_smem_size()); } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} +} +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/device/implicit_gemm_convolution.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/device/implicit_gemm_convolution.h new file mode 100644 index 0000000000000000000000000000000000000000..a9aae87bc1c57a20e27298b4f227726dd199a769 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/device/implicit_gemm_convolution.h @@ -0,0 +1,388 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Template for device-level Implicit GEMM Convolution +*/ + +#pragma once + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/device_kernel.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/cuda_host_adapter.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +class ImplicitGemmConvolution { +public: + + using UnderlyingKernel = GetUnderlyingKernel_t; + + using ElementA = typename UnderlyingKernel::ElementA; + using LayoutA = typename UnderlyingKernel::LayoutA; + using ElementB = typename UnderlyingKernel::ElementB; + using LayoutB = typename UnderlyingKernel::LayoutB; + using ElementC = typename 
UnderlyingKernel::ElementC; + using LayoutC = typename UnderlyingKernel::LayoutC; + using ElementAccumulator = typename UnderlyingKernel::ElementAccumulator; + using ElementCompute = typename UnderlyingKernel::ElementCompute; + using OperatorClass = typename UnderlyingKernel::OperatorClass; + using ArchTag = typename UnderlyingKernel::ArchTag; + using ThreadblockShape = typename UnderlyingKernel::ThreadblockShape; + using WarpShape = typename UnderlyingKernel::WarpShape; + using InstructionShape = typename UnderlyingKernel::InstructionShape; + using ThreadblockSwizzle = typename UnderlyingKernel::ThreadblockSwizzle; + using EpilogueOutputOp = typename UnderlyingKernel::EpilogueOutputOp; + static int const kStages = UnderlyingKernel::kStages; + static int const kConvDim = UnderlyingKernel::kConvDim; + using WarpMmaOperator = typename UnderlyingKernel::WarpMmaOperator; + using ArchMmaOperator = typename UnderlyingKernel::ArchMmaOperator; + using MathOperator = typename UnderlyingKernel::MathOperator; + + static cutlass::conv::Operator const kConvolutionalOperator = UnderlyingKernel::kConvolutionalOperator; + static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = UnderlyingKernel::kIteratorAlgorithm; + static cutlass::conv::StrideSupport const kStrideSupport = UnderlyingKernel::kStrideSupport; + static cutlass::conv::GroupMode const kGroupMode = UnderlyingKernel::kGroupMode; + + static bool const kEnableCudaHostAdapter = CUTLASS_ENABLE_CUDA_HOST_ADAPTER; + + static int const kWarpCount = + (ThreadblockShape::kM / WarpShape::kM) * + (ThreadblockShape::kN / WarpShape::kN) * + (ThreadblockShape::kK / WarpShape::kK); + + /// Argument structure + using Arguments = typename UnderlyingKernel::Arguments; + +private: + + /// Kernel parameters object + typename UnderlyingKernel::Params params_; + +public: + + /// Constructs Implicit GEMM + ImplicitGemmConvolution() { } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ static Status can_implement(Arguments const &args) { + // dispatch to iterators + Status status = UnderlyingKernel::Mma::IteratorA::can_implement(args.problem_size); + if (Status::kSuccess != status) { + return status; + } + + status = UnderlyingKernel::Mma::IteratorB::can_implement(args.problem_size); + if (Status::kSuccess != status) { + return status; + } + + // Check that tensor sizes don't exceed maximum supported size + if (kConvolutionalOperator == conv::Operator::kFprop) { + if (args.problem_size.activation_size() * sizeof(ElementA) >= + (1ull << 31) || + args.problem_size.filter_size() * sizeof(ElementB) >= (1ull << 31) || + args.problem_size.output_size() * sizeof(ElementC) >= (1ull << 31)) { + return Status::kErrorInvalidProblem; + } + } + else if (kConvolutionalOperator == conv::Operator::kDgrad || + kConvolutionalOperator == conv::Operator::kDeconv) { + if (args.problem_size.activation_size() * sizeof(ElementC) >= + (1ull << 31) || + args.problem_size.filter_size() * sizeof(ElementB) >= (1ull << 31) || + args.problem_size.output_size() * sizeof(ElementA) >= (1ull << 31)) { + return Status::kErrorInvalidProblem; + } + } + else if (kConvolutionalOperator == conv::Operator::kWgrad) { + if (args.problem_size.activation_size() * sizeof(ElementB) >= + (1ull << 31) || + args.problem_size.filter_size() * sizeof(ElementC) >= (1ull << 31) || + args.problem_size.output_size() * sizeof(ElementA) >= (1ull << 31)) { + return Status::kErrorInvalidProblem; + } + } + + // check group conv constraint + if (args.problem_size.groups != 1) { + if (kGroupMode == conv::GroupMode::kNone) { + return Status::kErrorInvalidProblem; + } + + // C and K should be multiple of groups + if (args.problem_size.K % args.problem_size.groups || + args.problem_size.C % args.problem_size.groups) { + return Status::kErrorInvalidProblem; + } + + // split-k is not supported + if (args.problem_size.split_k_slices != 1) { + return Status::kErrorInvalidProblem; + } + + int k_per_group = 
args.problem_size.K / args.problem_size.groups; + // k_per_group should be multiple of ThreadblockShape N, one CTA calculates one group + if (kGroupMode == conv::GroupMode::kSingleGroup && k_per_group % ThreadblockShape::kN) { + return Status::kErrorInvalidProblem; + } + // ThreadblockShape::kN should be divisible by k_per_group, one CTA calculates multiple groups + if (kGroupMode == conv::GroupMode::kMultipleGroup && ThreadblockShape::kN % k_per_group) { + return Status::kErrorInvalidProblem; + } + + // current optimized iterator algo only supports SingleGroup mode + if (kIteratorAlgorithm == IteratorAlgorithm::kOptimized && + kGroupMode != conv::GroupMode::kSingleGroup) { + return Status::kErrorInvalidProblem; + } + } + + static int const kAlignmentC = UnderlyingKernel::Epilogue::OutputTileIterator::kElementsPerAccess; + if (kConvolutionalOperator == conv::Operator::kFprop) { + if (args.problem_size.K % kAlignmentC) + return Status::kErrorMisalignedOperand; + } else if (kConvolutionalOperator == conv::Operator::kDgrad || kConvolutionalOperator == conv::Operator::kDeconv) { + if (args.problem_size.C % kAlignmentC) + return Status::kErrorMisalignedOperand; + } else if (kConvolutionalOperator == conv::Operator::kWgrad) { + if (args.problem_size.C % kAlignmentC) + return Status::kErrorMisalignedOperand; + } + + // check for unsupported problem sizes for strided dgrad / deconv implementation + if ((kConvolutionalOperator == conv::Operator::kDgrad || kConvolutionalOperator == conv::Operator::kDeconv) && + kStrideSupport == conv::StrideSupport::kStrided) { + // split-k (serial or parallel) is not supported for strided dgrad / deconv + if(args.problem_size.split_k_slices > 1 && (args.problem_size.stride().at(args.problem_size.stride().max_dim_index()) > 1)) { + return Status::kErrorNotSupported; + } + + // dilation > {1x1} is not supported for strided dgrad / deconv + if(args.problem_size.dilation_h > 1 || args.problem_size.dilation_w > 1) { + return 
Status::kErrorNotSupported; + } + } + + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + dim3 grid = threadblock_swizzle.get_grid_shape( + threadblock_swizzle.get_tiled_shape( + kConvolutionalOperator, + args.problem_size, + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.problem_size.split_k_slices)); + + if (!(grid.y <= std::numeric_limits::max() && + grid.z <= std::numeric_limits::max())) { + + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + + /// Gets the workspace size + static size_t get_workspace_size(Arguments const &args) { + + size_t workspace_bytes = 0; + + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape( + kConvolutionalOperator, + args.problem_size, + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.problem_size.split_k_slices); + + if(args.split_k_mode == SplitKMode::kParallel) { + + // Split-K parallel: CTAs in k-dimension write the partial results in a temporary workspace. + // The user needs to call a reduction operator to obtain the final output tensor + workspace_bytes = + sizeof(ElementAccumulator) * + size_t(cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, args.problem_size)) * + size_t(grid_tiled_shape.k()); + } + + else if(args.split_k_mode == SplitKMode::kSerial && args.problem_size.split_k_slices > 1) { + + // Split-K serial: The user workspace is used to store semaphore and serialize writing the + // final reduced output to user's output tensor + workspace_bytes = sizeof(int) * size_t(grid_tiled_shape.m()) * size_t(grid_tiled_shape.n()); + } + + return workspace_bytes; + } + + /// Initializes GEMM state from arguments. 
+ Status initialize( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr, + CudaHostAdapter *cuda_adapter = nullptr) { + + if (args.problem_size.split_k_slices > 1) { + + if (!workspace) { + return Status::kErrorWorkspaceNull; + } + + cudaError_t status = cudaMemsetAsync(workspace, 0, get_workspace_size(args), stream); + + if (status != cudaSuccess) { + return Status::kErrorInternal; + } + } + + // initialize the params structure from the arguments + params_ = typename UnderlyingKernel::Params( + args, + static_cast(workspace) + ); + + if constexpr (kEnableCudaHostAdapter) { + CUTLASS_ASSERT(cuda_adapter); + return Status::kSuccess; + } + else { + int smem_size = int(sizeof(typename UnderlyingKernel::SharedStorage)); + + if (smem_size >= (48 << 10)) { + cudaError_t result = cudaFuncSetAttribute(cutlass::Kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + } + } + + return Status::kSuccess; + } + + /// Initializes GEMM state from arguments. + Status update(Arguments const &args, void *workspace = nullptr) { + + // update the params structure from the arguments + params_.ptr_A = args.ref_A.data(); + params_.ptr_B = args.ref_B.data(); + params_.ptr_C = args.ref_C.data(); + params_.ptr_D = args.ref_D.data(); + params_.output_op = args.output_op; + params_.semaphore = static_cast(workspace); + + return Status::kSuccess; + } + + /// Runs the kernel using initialized state. 
+ Status run(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) { + + + ThreadblockSwizzle threadblock_swizzle; + + dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape); + dim3 block(32 * kWarpCount, 1, 1); + + int smem_size = int(sizeof(typename UnderlyingKernel::SharedStorage)); + cutlass::Status launch_result = cutlass::Status::kSuccess ; + + if constexpr (kEnableCudaHostAdapter) { + // + // Use the cuda host adapter + // + CUTLASS_ASSERT(cuda_adapter); + if (cuda_adapter) { + + void* kernel_params[] = {¶ms_}; + launch_result = cuda_adapter->launch( + grid, dim3(1,1,1), block, smem_size, stream, kernel_params, kernel_index + ); + } + else { + launch_result = Status::kErrorInternal; + } + } + else { + cutlass::arch::synclog_setup(); + cutlass::Kernel<<>>(params_); + } + + cudaError_t result = cudaGetLastError(); + if (cudaSuccess == result && Status::kSuccess == launch_result) { + return Status::kSuccess; + } + else { + CUTLASS_TRACE_HOST(" Kernel launch failed. Reason: " << result); + return Status::kErrorInternal; + } + } + + /// Runs the kernel using initialized state. + Status operator()(cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) { + return run(stream, cuda_adapter, kernel_index); + } + + /// Runs the kernel using initialized state. 
+ Status operator()( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr, CudaHostAdapter *cuda_adapter = nullptr, int32_t kernel_index = 0) { + + Status status = initialize(args, workspace, stream, cuda_adapter); + + if (status == Status::kSuccess) { + status = run(stream, cuda_adapter, kernel_index); + } + + return status; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} +} +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h new file mode 100644 index 0000000000000000000000000000000000000000..efd3dcbad093cf8d11036a63a9b6638d1801aeee --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h @@ -0,0 +1,269 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/* \file + \brief Template for device-level fused activation's scale+bias+relu and Implicit GEMM Convolution +*/ + +#pragma once + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/device_kernel.h" +#include "cutlass/conv/convolution.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +class ImplicitGemmConvolutionFusion { +public: + + using ImplicitGemmFusionKernel = ImplicitGemmFusionKernel_; + + using ElementA = typename ImplicitGemmFusionKernel::ElementA; + using LayoutA = typename ImplicitGemmFusionKernel::LayoutA; + using ElementB = typename ImplicitGemmFusionKernel::ElementB; + using LayoutB = typename ImplicitGemmFusionKernel::LayoutB; + +// 
using ElementScaleBias = typename ImplicitGemmFusionKernel::ElementScaleBias; +// using LayoutScaleBias = typename ImplicitGemmFusionKernel::LayoutScaleBias; + + using ElementC = typename ImplicitGemmFusionKernel::ElementC; + using LayoutC = typename ImplicitGemmFusionKernel::LayoutC; + using ElementAccumulator = typename ImplicitGemmFusionKernel::ElementAccumulator; + using ElementCompute = typename ImplicitGemmFusionKernel::ElementCompute; + using OperatorClass = typename ImplicitGemmFusionKernel::OperatorClass; + using ArchTag = typename ImplicitGemmFusionKernel::ArchTag; + using ThreadblockShape = typename ImplicitGemmFusionKernel::ThreadblockShape; + using WarpShape = typename ImplicitGemmFusionKernel::WarpShape; + using InstructionShape = typename ImplicitGemmFusionKernel::InstructionShape; + using ThreadblockSwizzle = typename ImplicitGemmFusionKernel::ThreadblockSwizzle; + using EpilogueOutputOp = typename ImplicitGemmFusionKernel::EpilogueOutputOp; + static int const kStages = ImplicitGemmFusionKernel::kStages; + static int const kConvDim = ImplicitGemmFusionKernel::kConvDim; + using WarpMmaOperator = typename ImplicitGemmFusionKernel::WarpMmaOperator; + using ArchMmaOperator = typename ImplicitGemmFusionKernel::ArchMmaOperator; + using MathOperator = typename ImplicitGemmFusionKernel::MathOperator; + + static cutlass::conv::Operator const kConvolutionalOperator = ImplicitGemmFusionKernel::kConvolutionalOperator; + static cutlass::conv::IteratorAlgorithm const kIteratorAlgorithm = ImplicitGemmFusionKernel::kIteratorAlgorithm; + + static int const kWarpCount = + (ThreadblockShape::kM / WarpShape::kM) * + (ThreadblockShape::kN / WarpShape::kN) * + (ThreadblockShape::kK / WarpShape::kK); + + /// Argument structure + using Arguments = typename ImplicitGemmFusionKernel::Arguments; + +private: + + /// Kernel parameters object + typename ImplicitGemmFusionKernel::Params params_; + +public: + + /// Constructs Implicit GEMM + ImplicitGemmConvolutionFusion() { } + + 
/// Determines whether the Implicit GEMM can execute the given problem. + static Status can_implement(Arguments const &args) { + + // dispatch to iterators + Status status = ImplicitGemmFusionKernel::Mma::IteratorA::can_implement(args.problem_size); + if (Status::kSuccess != status) { + return status; + } + + status = ImplicitGemmFusionKernel::Mma::IteratorB::can_implement(args.problem_size); + if (Status::kSuccess != status) { + return status; + } + + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + dim3 grid = threadblock_swizzle.get_grid_shape( + threadblock_swizzle.get_tiled_shape( + cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size), + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.problem_size.split_k_slices)); + + if (!(grid.y <= std::numeric_limits::max() && + grid.z <= std::numeric_limits::max())) { + + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + + /// Gets the workspace size + static size_t get_workspace_size(Arguments const &args) { + + size_t workspace_bytes = 0; + + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord grid_tiled_shape = threadblock_swizzle.get_tiled_shape( + cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size), + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.problem_size.split_k_slices); + + if(args.split_k_mode == SplitKMode::kParallel) { + + // Split-K parallel: CTAs in k-dimension write the partial results in a temporary workspace. 
+ // The user needs to call a reduction operator to obtain the final output tensor + workspace_bytes = + sizeof(ElementAccumulator) * + size_t(cutlass::conv::implicit_gemm_tensor_c_size(kConvolutionalOperator, args.problem_size)) * + size_t(grid_tiled_shape.k()); + } + + else if(args.split_k_mode == SplitKMode::kSerial && args.problem_size.split_k_slices > 1) { + + // Split-K serial: The user workspace is used to store semaphore and serialize writing the + // final reduced output to user's output tensor + workspace_bytes = sizeof(int) * size_t(grid_tiled_shape.m()) * size_t(grid_tiled_shape.n()); + } + + return workspace_bytes; + } + + /// Initializes GEMM state from arguments. + Status initialize( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + if (args.problem_size.split_k_slices > 1) { + + if (!workspace) { + return Status::kErrorWorkspaceNull; + } + + cudaError_t status = cudaMemsetAsync(workspace, 0, get_workspace_size(args), stream); + + if (status != cudaSuccess) { + return Status::kErrorInternal; + } + } + + // initialize the params structure from the arguments + params_ = typename ImplicitGemmFusionKernel::Params( + args, + static_cast(workspace) + ); + + int smem_size = int(sizeof(typename ImplicitGemmFusionKernel::SharedStorage)); + + if (smem_size >= (48 << 10)) { + cudaError_t result = cudaFuncSetAttribute(cutlass::Kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size); + + if (result != cudaSuccess) { + return Status::kErrorInternal; + } + } + + return Status::kSuccess; + } + + /// Initializes Implicit GEMM state from arguments. 
+ Status update(Arguments const &args, void *workspace = nullptr) { + + // update the params structure from the arguments + params_.ptr_A = args.ref_A.data(); + params_.ptr_B = args.ref_B.data(); + params_.ptr_scale = args.ref_A_scale.data(); + params_.ptr_bias = args.ref_A_bias.data(); + params_.ptr_C = args.ref_C.data(); + params_.ptr_D = args.ref_D.data(); + params_.output_op = args.output_op; + params_.semaphore = static_cast(workspace); + + return Status::kSuccess; + } + + /// Runs the kernel using initialized state. + Status run(cudaStream_t stream = nullptr) { + + ThreadblockSwizzle threadblock_swizzle; + + dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape); + dim3 block(32 * kWarpCount, 1, 1); + + int smem_size = int(sizeof(typename ImplicitGemmFusionKernel::SharedStorage)); + + cutlass::arch::synclog_setup(); + cutlass::Kernel<<>>(params_); + + cudaError_t result = cudaGetLastError(); + + return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal; + } + + /// Runs the kernel using initialized state. + Status operator()(cudaStream_t stream = nullptr) { + return run(stream); + } + + /// Runs the kernel using initialized state. 
+ Status operator()( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + Status status = initialize(args, workspace, stream); + + if (status == Status::kSuccess) { + status = run(stream); + } + + return status; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} +} +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/dispatch_policy.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/dispatch_policy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d569cb1c3e6d6c7da188691a94384d43259d2be0 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/dispatch_policy.hpp @@ -0,0 +1,136 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cutlass/conv/convolution.h" +#include "cutlass/epilogue/thread/activation.h" +#include "cutlass/arch/arch.h" + +#include "cute/layout.hpp" +#include "cute/numeric/integral_constant.hpp" + +#include "cutlass/gemm/dispatch_policy.hpp" + +////////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv { + +////////////////////////////////////////////////////////////////////////////// + +// +// Policies for categorical dispatch of mainloop against kernel grid schedules +// +struct KernelImplicitTmaWarpSpecializedSm90 : cutlass::gemm::KernelTmaWarpSpecialized { }; +struct KernelImplicitTmaWarpSpecializedSm90Cooperative { }; +struct KernelImplicitTmaWarpSpecializedSm90Pingpong { }; + +// +// Collective Mainloop Policies +// + +// n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, static schedule between TMA and GMMA +// for fprop +template< + conv::Operator ConvOp_, + int Stages_, + int NumSpatialDimensions_, + class 
ClusterShape_ = cute::Shape,cute::C<1>,cute::C<1>>, + class KernelSchedule = KernelImplicitTmaWarpSpecializedSm90, + int PipelineAsyncMmaStages_ = 1 +> +struct MainloopSm90TmaGmmaWarpSpecializedImplicitGemm { + static constexpr int Stages = Stages_; + static constexpr int NumSpatialDimensions = NumSpatialDimensions_; + static constexpr Operator ConvOp = ConvOp_; + static constexpr int PipelineAsyncMmaStages = PipelineAsyncMmaStages_; + using ClusterShape = ClusterShape_; + using ArchTag = arch::Sm90; + using Schedule = KernelSchedule; + + static_assert(NumSpatialDimensions >= 1); + static_assert(! (cute::is_same_v || + cute::is_same_v), + "Persistent schedules not support for conv yet."); +}; + + + +// SM100 tensor op kernel schedule +struct KernelImplicitTmaWarpSpecializedSm100 { + static constexpr int SchedulerPipelineStageCount = 0; + static constexpr int AccumulatorPipelineStageCount = 0; +}; + +// Pseudo-policies for builder auto override that dispatches to the KernelImplicitTmaWarpSpecializedSm100 +// but for opting into 1 or 2 SM atoms +struct KernelImplicitTmaWarpSpecialized1SmSm100 : KernelImplicitTmaWarpSpecializedSm100 { }; +struct KernelImplicitTmaWarpSpecialized2SmSm100 : KernelImplicitTmaWarpSpecializedSm100 { }; + +struct KernelStridedDgradTmaWs1SmSm100 { }; +struct KernelStridedDgradTmaWs2SmSm100 { }; + +// Policy for implicit gemm kernel +template< + int SchedulerPipelineStageCount_, + int AccumulatorPipelineStageCount_ +> +struct KernelScheduleImplicitTmaWarpSpecializedSm100 : KernelImplicitTmaWarpSpecializedSm100 { + static constexpr int SchedulerPipelineStageCount = SchedulerPipelineStageCount_; + static constexpr int AccumulatorPipelineStageCount = AccumulatorPipelineStageCount_; +}; + +// n-buffer in smem (Blackwell TMA), pipelined with Blackwell UMMA and TMA, fprop +template< + conv::Operator ConvOp_, + int Stages_, + int NumSpatialDimensions_, + int SchedulerPipelineStageCount_, + int AccumulatorPipelineStageCount_, + class ClusterShape_ = 
cute::Shape,cute::C<1>,cute::C<1>> +> +struct MainloopSm100TmaUmmaWarpSpecializedImplicitGemm { + static constexpr int Stages = Stages_; + static constexpr int NumSpatialDimensions = NumSpatialDimensions_; + static constexpr Operator ConvOp = ConvOp_; + using ClusterShape = ClusterShape_; + using ArchTag = arch::Sm100; + using Schedule = KernelScheduleImplicitTmaWarpSpecializedSm100; + + static_assert(NumSpatialDimensions >= 1); +}; + +////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::conv + +////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/conv_universal.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/conv_universal.hpp new file mode 100644 index 0000000000000000000000000000000000000000..af804df30e76a156af33f7095da64614370e466c --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/conv_universal.hpp @@ -0,0 +1,65 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cutlass/conv/convnd_problem_shape.hpp" +#include "cutlass/detail/dependent_false.hpp" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv::kernel { + +//////////////////////////////////////////////////////////////////////////////// + +/* + * Stateless universal device CONV kernel type that treats CONV as + * a composition of a collective mainloop and a collective epilogue. 
+**/ +template < + class ProblemShape_, + class CollectiveMainloop_, + class CollectiveEpilogue_, + class TileSchedulerTag_ = void, + class Enable = void +> +class ConvUniversal { + static_assert(cutlass::detail::dependent_false, + "Could not find a valid specialization at the kernel layer to dispatch against."); +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::conv::kernel + +//////////////////////////////////////////////////////////////////////////////// +#include "cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp" +#include "cutlass/conv/kernel/sm100_implicit_gemm_tma_warpspecialized.hpp" +//////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d.h new file mode 100644 index 0000000000000000000000000000000000000000..f9647a598799cf233962457f8d2cad7e59e46cf5 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d.h @@ -0,0 +1,322 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions for threadblock-scoped epilogue. 
*/

#pragma once

#include "cutlass/cutlass.h"
#include "cutlass/gemm/threadblock/default_mma.h"
#include "cutlass/gemm/threadblock/threadblock_swizzle.h"
#include "cutlass/conv/threadblock/threadblock_swizzle.h"
#include "cutlass/epilogue/threadblock/default_epilogue_simt.h"
#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h"
#include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h"
#include "cutlass/epilogue/threadblock/default_epilogue_with_reduction.h"
#include "cutlass/conv/convolution.h"
#include "cutlass/conv/threadblock/conv2d_tile_iterator.h"
#include "cutlass/conv/threadblock/implicit_gemm_pipelined.h"
#include "cutlass/conv/threadblock/implicit_gemm_multistage.h"
#include "cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h"
#include "cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h"
#include "cutlass/conv/kernel/implicit_gemm_convolution.h"
#include "cutlass/conv/kernel/implicit_gemm_convolution_fusion.h"
#include "cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h"

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace cutlass {
namespace conv {
namespace kernel {

/////////////////////////////////////////////////////////////////////////////////////////////////

namespace detail {

/// Selects a threadblock-scoped epilogue for Tensor Op convolutions.
/// The primary template uses the generic Tensor Op epilogue; the Sm70
/// specialization below substitutes the Volta-specific variant.
template <
  typename ArchTag,
  typename Shape,
  typename WarpMmaTensorOp,
  int PartitionsK,
  typename OutputOp
>
struct DefaultConvEpilogue {
  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
    Shape,
    WarpMmaTensorOp,
    PartitionsK,
    OutputOp,
    OutputOp::kCount
  >::Epilogue;
};

/// Sm70 (Volta) partial specialization: route to the Volta Tensor Op epilogue.
template <
  typename Shape,
  typename WarpMmaTensorOp,
  int PartitionsK,
  typename OutputOp
>
struct DefaultConvEpilogue<
  arch::Sm70,
  Shape,
  WarpMmaTensorOp,
  PartitionsK,
  OutputOp
> {

  using Epilogue = typename epilogue::threadblock::DefaultEpilogueVoltaTensorOp<
    Shape,
    WarpMmaTensorOp,
    PartitionsK,
    OutputOp,
    OutputOp::kCount
  >::Epilogue;
};

/////////////////////////////////////////////////////////////////////////////////////////////////

/// SIMT epilogue with an additional broadcast tensor input (e.g. residual add).
template <
  typename ArchTag,
  typename Shape,
  typename WarpMmaSimt,
  typename ElementOutput,
  typename ElementTensor,
  typename ElementVector,
  typename OutputOp,
  int ElementsPerAccess,
  typename PermuteDLayout = layout::NoPermute,
  conv::StrideSupport StrideSupport = conv::StrideSupport::kUnity,
  int Rank = 4
>
struct DefaultConvEpilogueWithBroadcastSimt {
  using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithBroadcastSimt<
    Shape,
    WarpMmaSimt,
    ElementOutput,
    ElementTensor,
    ElementVector,
    OutputOp,
    ElementsPerAccess,
    false,
    PermuteDLayout,
    StrideSupport,
    Rank
  >::Epilogue;
};

/// SIMT broadcast epilogue variant used by the strided-dgrad kernel path.
template <
  typename ArchTag,
  typename Shape,
  typename WarpMmaSimt,
  typename ElementOutput,
  typename ElementTensor,
  typename ElementVector,
  typename OutputOp,
  int ElementsPerAccess
>
struct DefaultConvEpilogueWithBroadcastSimtStridedDgrad {
  using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithBroadcastSimtStridedDgrad<
    Shape,
    WarpMmaSimt,
    ElementOutput,
    ElementTensor,
    ElementVector,
    OutputOp,
    ElementsPerAccess
  >::Epilogue;
};

/// Tensor Op epilogue with broadcast; primary template (non-Volta architectures).
template <
  typename ArchTag,
  typename Shape,
  typename WarpMmaTensorOp,
  int PartitionsK,
  typename ElementOutput,
  typename ElementTensor,
  typename ElementVector,
  typename OutputOp,
  int ElementsPerAccess
>
struct DefaultConvEpilogueWithBroadcastTensorOp {
  using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithBroadcastTensorOp<
    Shape,
    WarpMmaTensorOp,
    PartitionsK,
    ElementOutput,
    ElementTensor,
    ElementVector,
    OutputOp,
    ElementsPerAccess
  >::Epilogue;
};

/// Sm70 (Volta) partial specialization of the broadcast Tensor Op epilogue.
template <
  typename Shape,
  typename WarpMmaTensorOp,
  int PartitionsK,
  typename ElementOutput,
  typename ElementTensor,
  typename ElementVector,
  typename OutputOp,
  int ElementsPerAccess
>
struct DefaultConvEpilogueWithBroadcastTensorOp<
  arch::Sm70,
  Shape,
  WarpMmaTensorOp,
  PartitionsK,
  ElementOutput,
  ElementTensor,
  ElementVector,
  OutputOp,
  ElementsPerAccess
  > {
  using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithBroadcastVoltaTensorOp<
    Shape,
    WarpMmaTensorOp,
    PartitionsK,
    ElementOutput,
    ElementTensor,
    ElementVector,
    OutputOp,
    ElementsPerAccess
  >::Epilogue;
};

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Tensor Op epilogue with an elementwise reduction; primary template.
template <
  typename ArchTag,
  typename Shape,
  typename WarpMmaTensorOp,
  int PartitionsK,
  typename ElementOutput,
  typename OutputOp,
  typename ReductionOp,
  int ElementsPerAccess
>
struct DefaultConvEpilogueWithReductionTensorOp {
  using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithReductionTensorOp<
    Shape,
    WarpMmaTensorOp,
    PartitionsK,
    ElementOutput,
    OutputOp,
    ReductionOp,
    ElementsPerAccess
  >::Epilogue;
};

/// Sm70 (Volta) partial specialization of the reduction Tensor Op epilogue.
template <
  typename Shape,
  typename WarpMmaTensorOp,
  int PartitionsK,
  typename ElementOutput,
  typename OutputOp,
  typename ReductionOp,
  int ElementsPerAccess
>
struct DefaultConvEpilogueWithReductionTensorOp<
  arch::Sm70,
  Shape,
  WarpMmaTensorOp,
  PartitionsK,
  ElementOutput,
  OutputOp,
  ReductionOp,
  ElementsPerAccess
  > {
  using Epilogue = typename epilogue::threadblock::DefaultEpilogueWithReductionVoltaTensorOp<
    Shape,
    WarpMmaTensorOp,
    PartitionsK,
    ElementOutput,
    OutputOp,
    ReductionOp,
    ElementsPerAccess
  >::Epilogue;
};

/////////////////////////////////////////////////////////////////////////////////////////////////

// Defaults for strided Dgrad
template <
  typename ArchTag,
  typename Shape,
  typename WarpMmaTensorOp,
  int PartitionsK,
  typename OutputOp
>
struct DefaultConvEpilogueStridedDgrad {
  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOpStridedDgrad<
    Shape,
    WarpMmaTensorOp,
    PartitionsK,
    OutputOp,
    OutputOp::kCount
  >::Epilogue;
};

/// Sm70 (Volta) partial specialization of the strided-dgrad epilogue.
template <
  typename Shape,
  typename WarpMmaTensorOp,
  int PartitionsK,
  typename OutputOp
>
struct DefaultConvEpilogueStridedDgrad<
  arch::Sm70,
  Shape,
  WarpMmaTensorOp,
  PartitionsK,
  OutputOp
> {

  using Epilogue = typename epilogue::threadblock::DefaultEpilogueVoltaTensorOpStridedDgrad<
    Shape,
    WarpMmaTensorOp,
    PartitionsK,
    OutputOp,
    OutputOp::kCount
  >::Epilogue;
};

} // namespace detail

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace kernel
} // namespace conv
} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_dgrad.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_dgrad.h
new file mode 100644
index 0000000000000000000000000000000000000000..27a96a5602494e2abe3980b3d07d54c49dcb9932
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_dgrad.h
@@ -0,0 +1,1927 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2.
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_tile_iterator.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dDgrad +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kStrided, + /// Access granularity of A matrix in units of elements + int AlignmentA = 128 / cutlass::sizeof_bits::value, + /// Access granularity of B matrix in units of elements + int AlignmentB = 128 / cutlass::sizeof_bits::value +> struct DefaultConv2dDgrad; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassTensorOp convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm Dgrad Strided and +// multistage pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport::kStrided, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kStrided, + AccessTypeA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + StrideSupport::kStrided, + AccessTypeB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename 
MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = + ((sizeof_bits::value * AlignmentB) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + CacheOpB, + MmaPolicy, + Stages + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOpStridedDgrad< + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; +}; + +/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm Dgrad Strided +// and 2 stage pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport::kStrided, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kStrided, + AccessTypeA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + StrideSupport::kStrided, + AccessTypeB + > + >; + + using SmemIteratorB = typename 
MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogueStridedDgrad< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm Dgrad Unity Strided +// and multistage pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport::kUnity, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity, + AccessTypeA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + StrideSupport::kUnity, + AccessTypeB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + 
using MmaPolicy = typename MmaCore::MmaPolicy; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = + ((sizeof_bits::value * AlignmentB) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + CacheOpB, + MmaPolicy, + Stages + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; +}; + +/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm Dgrad Unity +// 2 stage pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport::kUnity, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity, + AccessTypeA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + StrideSupport::kUnity, + AccessTypeB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level 
GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dDgrad specialization for optimized IteratorAlgorithm Dgrad Unity Strided +// and multistage pipeline. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, 
layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity, + AccessTypeA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + StrideSupport::kUnity, + AccessTypeB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = + ((sizeof_bits::value * AlignmentB) == 128) + ? 
cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + CacheOpB, + MmaPolicy, + Stages + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; +}; + +/// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm Dgrad Strided and +// multistage pipeline. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kStrided, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define 
iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kStrided, + AccessTypeA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + StrideSupport::kStrided, + AccessTypeB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = + ((sizeof_bits::value * AlignmentB) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + CacheOpB, + MmaPolicy, + Stages + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOpStridedDgrad< + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; +}; + +/// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm Dgrad Strided +// and 2 stage pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kStrided, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kStrided, + AccessTypeA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + StrideSupport::kStrided, + AccessTypeB + > + >; + + using SmemIteratorB = typename 
MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogueStridedDgrad< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; +}; + +/// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm Dgrad Unity +// 2 stage pipeline +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + + // Define 
iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity, + AccessTypeA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + StrideSupport::kUnity, + AccessTypeB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm, +/// multi-stage pipeline, 
and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + conv::StrideSupport::kUnity, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kUnity + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + conv::StrideSupport::kUnity + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using 
Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + conv::StrideSupport::kStrided, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + 
cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kStrided + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + conv::StrideSupport::kStrided + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + 
LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + StrideSupport::kUnity + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + 
ThreadblockSwizzle, + conv::Operator::kDgrad + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + conv::StrideSupport::kStrided, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kStrided + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + conv::StrideSupport::kStrided + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + 
using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dDgrad specialization for Analytic IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + conv::StrideSupport::kUnity, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, 
arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kUnity + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + conv::StrideSupport::kUnity + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, 
+ int AlignmentA, + int AlignmentB +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + conv::StrideSupport::kStrided, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kStrided + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + conv::StrideSupport::kStrided + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad< + 
ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dDgrad specialization for Optimized IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define 
iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + StrideSupport::kUnity + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + conv::StrideSupport::kStrided, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = 
typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kStrided + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv2dDgradFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + conv::StrideSupport::kStrided + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad + >; + +}; + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git 
a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop.h new file mode 100644 index 0000000000000000000000000000000000000000..932d1abdc6e2c80a4a1e8d1eb805cbedbe5ac78a --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop.h @@ -0,0 +1,2007 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h" +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h" + +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dFprop +template < + 
typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kUnity, + /// Access granularity of A matrix in units of elements + int AlignmentA = 128 / cutlass::sizeof_bits::value, + /// Access granularity of B matrix in units of elements + int AlignmentB = 128 / cutlass::sizeof_bits::value +> struct DefaultConv2dFprop; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassTensorOp convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage +/// pipeline. 
+template <
+  typename ElementA, typename LayoutA,
+  typename ElementB, typename LayoutB,
+  typename ElementC, typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape, typename WarpShape, typename InstructionShape,
+  typename EpilogueOutputOp, typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA, int AlignmentB
+>
+struct DefaultConv2dFprop <
+  ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ElementAccumulator,
+  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape, InstructionShape,
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, MathOperatorTag,
+  IteratorAlgorithm::kAnalytic, StrideSupport, AlignmentA, AlignmentB
+> {
+
+  // Threadblock-scoped matrix-multiply core (Tensor Core path: B staged column-major).
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+    ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+    ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+    Stages, MathOperatorTag>;
+
+  // A operand: activation tiles.  AlignedArray arguments restored from the
+  // Alignment template parameters (stripped during extraction) -- confirm upstream.
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA, LayoutA,
+      ThreadMapA,
+      AccessTypeA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // B operand: filter tiles.
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB, LayoutB,
+      ThreadMapB,
+      AccessTypeB
+    >;
+
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Cache-global only for full 128-bit accesses on B; otherwise cache at all levels.
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+    ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
+      ? cutlass::arch::CacheOperation::Global
+      : cutlass::arch::CacheOperation::Always;
+
+  // Multistage threadblock-scoped implicit-GEMM mainloop.
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA, SmemIteratorA, arch::CacheOperation::Always,
+    IteratorB, SmemIteratorB, CacheOpB,
+    MmaPolicy, Stages
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Tensor-op epilogue.
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape, WarpMmaTensorOp, kPartitionsK, EpilogueOutputOp, EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Kernel-level operator.
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma, Epilogue, ThreadblockSwizzle, conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for FixedChannels IteratorAlgorithm and
+/// multistage pipeline.
+template <
+  typename ElementA, typename LayoutA,
+  typename ElementB, typename LayoutB,
+  typename ElementC, typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape, typename WarpShape, typename InstructionShape,
+  typename EpilogueOutputOp, typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA, int AlignmentB
+>
+struct DefaultConv2dFprop <
+  ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ElementAccumulator,
+  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape, InstructionShape,
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, MathOperatorTag,
+  IteratorAlgorithm::kFixedChannels, StrideSupport, AlignmentA, AlignmentB
+> {
+
+  // Threadblock-scoped matrix-multiply core.
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+    ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+    ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+    Stages, MathOperatorTag>;
+
+  // A operand: activation tiles, fixed-channel-count iterator.
+  // AlignedArray arguments restored from the Alignment template parameters
+  // (stripped during extraction) -- confirm upstream.
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorFixedChannels<
+      cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+      ElementA, LayoutA,
+      ThreadMapA,
+      AccessTypeA
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // B operand: filter tiles, fixed-channel-count iterator.
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorFixedChannels<
+      cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+      ElementB, LayoutB,
+      ThreadMapB,
+      AccessTypeB
+    >;
+
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Cache-global only for full 128-bit accesses on B; otherwise cache at all levels.
+  static cutlass::arch::CacheOperation::Kind const CacheOpB =
+    ((sizeof_bits<ElementB>::value * AlignmentB) == 128)
+      ? cutlass::arch::CacheOperation::Global
+      : cutlass::arch::CacheOperation::Always;
+
+  // Multistage threadblock-scoped implicit-GEMM mainloop.
+  using Mma = threadblock::ImplicitGemmMultistage<
+    ThreadblockShape,
+    IteratorA, SmemIteratorA, arch::CacheOperation::Always,
+    IteratorB, SmemIteratorB, CacheOpB,
+    MmaPolicy, Stages
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Tensor-op epilogue.
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape, WarpMmaTensorOp, kPartitionsK, EpilogueOutputOp, EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Kernel-level operator.
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma, Epilogue, ThreadblockSwizzle, conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for FixedChannels IteratorAlgorithm and
+/// two stage pipeline.
+template <
+  typename ElementA, typename LayoutA,
+  typename ElementB, typename LayoutB,
+  typename ElementC, typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape, typename WarpShape, typename InstructionShape,
+  typename EpilogueOutputOp, typename ThreadblockSwizzle,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA, int AlignmentB
+>
+struct DefaultConv2dFprop <
+  ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ElementAccumulator,
+  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape, InstructionShape,
+  EpilogueOutputOp, ThreadblockSwizzle, 2, MathOperatorTag,
+  IteratorAlgorithm::kFixedChannels, StrideSupport, AlignmentA, AlignmentB
+> {
+
+  // Threadblock-scoped matrix-multiply core (two-stage).
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+    ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+    ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp,
+    2, MathOperatorTag>;
+
+  // A operand: fixed-channel activation iterator wrapped for the two-stage mainloop.
+  // AlignedArray arguments restored (stripped during extraction) -- confirm upstream.
+  using ThreadMapA = typename MmaCore::IteratorThreadMapA;
+  using AccessTypeA = cutlass::AlignedArray<ElementA, AlignmentA>;
+  using IteratorA =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorFixedChannels<
+        cutlass::MatrixShape<ThreadblockShape::kM, ThreadblockShape::kK>,
+        ElementA, LayoutA,
+        ThreadMapA,
+        AccessTypeA
+      >
+    >;
+
+  using SmemIteratorA = typename MmaCore::SmemIteratorA;
+
+  // B operand: fixed-channel filter iterator wrapped for the two-stage mainloop.
+  using ThreadMapB = typename MmaCore::IteratorThreadMapB;
+  using AccessTypeB = cutlass::AlignedArray<ElementB, AlignmentB>;
+  using IteratorB =
+    cutlass::conv::threadblock::TileIterator<
+      cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorFixedChannels<
+        cutlass::MatrixShape<ThreadblockShape::kK, ThreadblockShape::kN>,
+        ElementB, LayoutB,
+        ThreadMapB,
+        AccessTypeB
+      >
+    >;
+
+  using SmemIteratorB = typename MmaCore::SmemIteratorB;
+
+  // Warp-level GEMM components
+  using WarpMmaTensorOp = typename MmaCore::MmaTensorOp;
+  using MmaPolicy = typename MmaCore::MmaPolicy;
+
+  // Two-stage threadblock-scoped implicit-GEMM mainloop.
+  using Mma = threadblock::ImplicitGemmPipelined<
+    ThreadblockShape,
+    IteratorA, SmemIteratorA,
+    IteratorB, SmemIteratorB,
+    ElementC, LayoutC,
+    MmaPolicy
+  >;
+
+  static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK;
+
+  // Tensor-op epilogue.
+  using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp<
+    ThreadblockShape, WarpMmaTensorOp, kPartitionsK, EpilogueOutputOp, EpilogueOutputOp::kCount
+  >::Epilogue;
+
+  // Kernel-level operator.
+  using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution<
+    Mma, Epilogue, ThreadblockSwizzle, conv::Operator::kFprop
+  >;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Defines a kernel for Conv2dFprop specialization for FewChannels IteratorAlgorithm and
+/// multistage pipeline.
+template <
+  typename ElementA, typename LayoutA,
+  typename ElementB, typename LayoutB,
+  typename ElementC, typename LayoutC,
+  typename ElementAccumulator,
+  typename ArchTag,
+  typename ThreadblockShape, typename WarpShape, typename InstructionShape,
+  typename EpilogueOutputOp, typename ThreadblockSwizzle,
+  int Stages,
+  typename MathOperatorTag,
+  conv::StrideSupport StrideSupport,
+  int AlignmentA, int AlignmentB
+>
+struct DefaultConv2dFprop <
+  ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, ElementAccumulator,
+  arch::OpClassTensorOp, ArchTag, ThreadblockShape, WarpShape, InstructionShape,
+  EpilogueOutputOp, ThreadblockSwizzle, Stages, MathOperatorTag,
+  IteratorAlgorithm::kFewChannels, StrideSupport, AlignmentA, AlignmentB
+> {
+
+  // Define the core components from GEMM (continues on the next hunk lines).
+  using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore<
+    ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor,
+ ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorFewChannels< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA, + AccessTypeA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorFewChannels< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + AccessTypeB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = + ((sizeof_bits::value * AlignmentB) == 128) + ? 
cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + CacheOpB, + MmaPolicy, + Stages + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage +/// pipeline. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kFewChannels, + StrideSupport, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + + // Define 
iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorFewChannels< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA, + AccessTypeA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorFewChannels< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + AccessTypeB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = + ((sizeof_bits::value * AlignmentB) == 128) + ? 
cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage +/// pipeline with interleaved layout. +template < + typename ElementA, + typename ElementB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB, + int InterleavedK +> +struct DefaultConv2dFprop < + ElementA, + layout::TensorNCxHWx, + ElementB, + layout::TensorCxRSKx, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + 
ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + Stages, MathOperatorTag, true>; + + // Define iterators over tiles from the A operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. + using ThreadMapA = typename MmaCore::SmemThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, layout::TensorNCxHWx, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. 
+ using ThreadMapB = typename MmaCore::SmemThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue< + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + InterleavedK + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm +/// and 2 stage pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA, + AccessTypeA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + AccessTypeB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM 
components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and 2 stage +/// pipeline with interleaved layout. +template < + typename ElementA, + typename ElementB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB, + int InterleavedK +> +struct DefaultConv2dFprop < + ElementA, + layout::TensorNCxHWx, + ElementB, + layout::TensorCxRSKx, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, 
layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + 2, MathOperatorTag, true>; + + // Define iterators over tiles from the A operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. + using ThreadMapA = typename MmaCore::SmemThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, layout::TensorNCxHWx, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + // Note GEMM shared memory threadmap is used here because conv global memory + // layout needs to be mapped to fprop which is similar to the crosswise + // layout which is used by the interleaved GEMM shared memory threadmap. + // The Interleaved GEMM global memory layout is similar to the congruous + // layout. 
+ using ThreadMapB = typename MmaCore::SmemThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue< + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + InterleavedK + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and +/// multistage pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag + >; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA, + AccessTypeA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB, + AccessTypeB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename 
MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = + ((sizeof_bits::value * AlignmentB) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + CacheOpB, + MmaPolicy, + Stages + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport, + 4 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and +// multistage pipeline with interleaved layout. 
+template < + typename ElementA, + typename ElementB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB, + int InterleavedK +> +struct DefaultConv2dFprop < + ElementA, + layout::TensorNCxHWx, + ElementB, + layout::TensorCxRSKx, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, ElementAccumulator, LayoutC, arch::OpClassTensorOp, + Stages, MathOperatorTag, true + >; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::SmemThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + layout::TensorNCxHWx, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::SmemThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + layout::TensorCxRSKx, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + 
using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue< + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + InterleavedK + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm +/// and 2 stage pipeline. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, 
MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA, + AccessTypeA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB, + AccessTypeB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and 2 stage +/// pipeline with interleaved layout. 
+template < + typename ElementA, + typename ElementB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB, + int InterleavedK +> +struct DefaultConv2dFprop < + ElementA, + layout::TensorNCxHWx, + ElementB, + layout::TensorCxRSKx, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajorInterleaved, + ElementB, layout::RowMajorInterleaved, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, + 2, MathOperatorTag, true>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::SmemThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, layout::TensorNCxHWx, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::SmemThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, layout::TensorCxRSKx, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + 
using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultInterleavedConvEpilogue< + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + InterleavedK + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename 
cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport, + 4 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename 
LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + 
arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport, + 4 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + 
cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport, + 4 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + conv::StrideSupport 
StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, 
+ layout::NoPermute, + StrideSupport, + 4 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h new file mode 100644 index 0000000000000000000000000000000000000000..85b142a0e27d3c39d2d742c1709582ea3156b801 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h @@ -0,0 +1,357 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief + Default kernel-level fused activation's scale+bias+relu and implicit GEMM convolution + definitions that combine threadblock-scoped matrix multiply-add with the + appropriate threadblock-scoped epilogue. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h" +#include "cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h" +#include "cutlass/gemm/warp/scale_bias_tile_iterator.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for fused batch norm and Conv2dFprop +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementScaleBias, + typename LayoutScaleBias, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kUnity +> struct DefaultConv2dFpropFusion; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassTensorOp convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm and multistage +/// pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementScaleBias, + typename LayoutScaleBias, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dFpropFusion < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementScaleBias, + LayoutScaleBias, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + /// Define iterators over tiles from scale/bias vectors + using IteratorScaleBias = + cutlass::conv::threadblock::PredicatedScaleBiasVectorAccessIterator< + cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias, + 
LayoutScaleBias>; + + using SmemIteratorScaleBias = + cutlass::transform::threadblock::RegularScaleBiasVectorAccessIterator< + cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias, + LayoutScaleBias>; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + static int const kThreadCount = 32; + + // Warp-level iterators to load scale and bias vectors + using WarpIteratorScaleBias = cutlass::gemm::warp::ScaleBiasTileIterator< + MatrixShape, ElementScaleBias, + LayoutScaleBias, MatrixShape, + typename WarpMmaTensorOp::IteratorA::Base::Policy, kThreadCount, + MmaCore::WarpCount::kK>; + + // Define the Mma + using Mma = threadblock::ImplicitGemmFpropFusionMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + IteratorScaleBias, + SmemIteratorScaleBias, + arch::CacheOperation::Always, + MmaPolicy, + WarpIteratorScaleBias, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionFusion< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dFprop specialization for Optimized IteratorAlgorithm and +/// multistage pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementScaleBias, + typename LayoutScaleBias, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dFpropFusion < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementScaleBias, + LayoutScaleBias, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag + >; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + /// Define iterators over tiles from scale/bias vectors + using IteratorScaleBias = + cutlass::conv::threadblock::PredicatedScaleBiasVectorAccessIterator< + cutlass::MatrixShape<1, ThreadblockShape::kK>, 
ElementScaleBias, + LayoutScaleBias>; + + using SmemIteratorScaleBias = + cutlass::transform::threadblock::RegularScaleBiasVectorAccessIterator< + cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias, + LayoutScaleBias>; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + static int const kThreadCount = 32; + + // Warp-level iterators to load scale and bias vectors + using WarpIteratorScaleBias = cutlass::gemm::warp::ScaleBiasTileIterator< + MatrixShape, ElementScaleBias, + LayoutScaleBias, MatrixShape, + typename WarpMmaTensorOp::IteratorA::Base::Policy, kThreadCount, + MmaCore::WarpCount::kK>; + + // Define the Mma + using Mma = threadblock::ImplicitGemmFpropFusionMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + IteratorScaleBias, + SmemIteratorScaleBias, + arch::CacheOperation::Always, + MmaPolicy, + WarpIteratorScaleBias, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionFusion< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_absmax.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_absmax.h new 
file mode 100644 index 0000000000000000000000000000000000000000..ccc751535c7a8c2c2f49b8d34f9d0e9a8edbd90e --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_absmax.h @@ -0,0 +1,127 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Defines a default configuration for convolution with absolute maximum calculation. +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/kernel/implicit_gemm_convolution_with_absmax.h" + +#include "cutlass/epilogue/threadblock/default_epilogue_with_absmax.h" +#include "cutlass/epilogue/threadblock/epilogue_with_absmax.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kUnity, + /// Access granularity of A matrix in units of elements + int AlignmentA = 128 / cutlass::sizeof_bits::value, + /// Access 
granularity of B matrix in units of elements + int AlignmentB = 128 / cutlass::sizeof_bits::value +> +struct DefaultConv2dFpropWithAbsMax { + + using ImplicitGemmBase = typename DefaultConv2dFprop< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + StrideSupport, + AlignmentA, + AlignmentB + >::Kernel; + + // Define epilogue + using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueWithAbsMax< + typename ImplicitGemmBase::Epilogue::Shape, + typename ImplicitGemmBase::Epilogue::WarpMmaOperator, + ImplicitGemmBase::Epilogue::kPartitionsK, + ElementC, + typename EpilogueOutputOp::ElementAuxOutput, + ElementC, + EpilogueOutputOp, + ImplicitGemmBase::Epilogue::kElementsPerAccess + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithAbsMax< + typename ImplicitGemmBase::Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h new file mode 100644 index 0000000000000000000000000000000000000000..b7fca981b0e0b44dca2b9add89808ac2b036d021 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h @@ -0,0 +1,221 @@ 
+/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Defines a GEMM with Broadcast based on an existing UniversalGemm kernel. 
+ +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h" + +#include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h" +#include "cutlass/epilogue/threadblock/epilogue_with_broadcast.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kUnity, + /// Access granularity of A matrix in units of elements + int AlignmentA = 128 / cutlass::sizeof_bits::value, + /// Access granularity of B matrix in units of elements + int AlignmentB = 128 / cutlass::sizeof_bits::value +> +struct DefaultConv2dFpropWithBroadcast { + + using ImplicitGemmBase = typename DefaultConv2dFprop< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + StrideSupport, + AlignmentA, + AlignmentB + >::Kernel; + + // Define epilogue + using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastTensorOp< + ArchTag, + typename ImplicitGemmBase::Epilogue::Shape, + typename 
ImplicitGemmBase::Epilogue::WarpMmaOperator, + ImplicitGemmBase::Epilogue::kPartitionsK, + ElementC, + typename EpilogueOutputOp::ElementT, + typename EpilogueOutputOp::ElementVector, + EpilogueOutputOp, + ImplicitGemmBase::Epilogue::kElementsPerAccess + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue< + typename ImplicitGemmBase::Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dFprop specialization for Analytic IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dFpropWithBroadcast < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + StrideSupport, + AlignmentA, + AlignmentB +> { + + using ImplicitGemmBase = typename DefaultConv2dFprop< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, 
+ IteratorAlgorithm, + StrideSupport, + AlignmentA, + AlignmentB + >::Kernel; + + // Define epilogue + using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimt< + ArchTag, + typename ImplicitGemmBase::Epilogue::Shape, + typename ImplicitGemmBase::Epilogue::WarpMmaOperator, + ElementC, + typename EpilogueOutputOp::ElementT, + typename EpilogueOutputOp::ElementVector, + EpilogueOutputOp, + ImplicitGemmBase::Epilogue::kElementsPerAccess + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue< + typename ImplicitGemmBase::Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h new file mode 100644 index 0000000000000000000000000000000000000000..5c2c7ffc700b089e449d4f18008c26cdb8d6c81a --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h @@ -0,0 +1,130 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Defines a GEMM with Reduction based on an existing UniversalGemm kernel. 
+ +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv2d_fprop.h" +#include "cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h" + +#include "cutlass/epilogue/threadblock/default_epilogue_with_reduction.h" +#include "cutlass/epilogue/threadblock/epilogue_with_reduction.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename EpilogueReductionOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kUnity, + /// Access granularity of A matrix in units of elements + int AlignmentA = 128 / cutlass::sizeof_bits::value, + /// Access granularity of B matrix in units of elements + int AlignmentB = 128 / cutlass::sizeof_bits::value +> +struct DefaultConv2dFpropWithReduction { + + using ImplicitGemmBase = typename DefaultConv2dFprop< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + StrideSupport, + AlignmentA, + AlignmentB + >::Kernel; + + // Define epilogue + using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithReductionTensorOp< + ArchTag, + typename ImplicitGemmBase::Epilogue::Shape, + typename 
ImplicitGemmBase::Epilogue::WarpMmaOperator, + ImplicitGemmBase::Epilogue::kPartitionsK, + ElementC, + EpilogueOutputOp, + EpilogueReductionOp, + ImplicitGemmBase::Epilogue::kElementsPerAccess + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue< + typename ImplicitGemmBase::Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_group_fprop.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_group_fprop.h new file mode 100644 index 0000000000000000000000000000000000000000..99e353d80a0b3b37818371737c8189eee6b5ed38 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_group_fprop.h @@ -0,0 +1,622 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h" +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h" + +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dGroupFprop +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::GroupMode GroupMode, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kUnity, + /// Access granularity of A matrix in units of elements + int AlignmentA = 128 / cutlass::sizeof_bits::value, + /// Access granularity of B matrix in units of elements + int AlignmentB = 128 / cutlass::sizeof_bits::value +> struct DefaultConv2dGroupFprop; + 
+///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassTensorOp convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dGroupFprop specialization for Analytic IteratorAlgorithm and multistage +/// pipeline that supports all GroupMode. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::GroupMode GroupMode, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dGroupFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + GroupMode, + IteratorAlgorithm::kAnalytic, + StrideSupport, + AlignmentA, + AlignmentB +> { + + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA 
= + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA, + AccessTypeA, + GroupMode + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + AccessTypeB, + GroupMode + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = + ((sizeof_bits::value * AlignmentB) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + CacheOpB, + MmaPolicy, + Stages + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv2dProblemSize, + GroupMode + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dGroupFprop specialization for Analytic IteratorAlgorithm and +/// 2 stage pipeline that supports all GroupMode. 
+ +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + conv::GroupMode GroupMode, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dGroupFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + GroupMode, + IteratorAlgorithm::kAnalytic, + StrideSupport, + AlignmentA, + AlignmentB +> { + + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA, + AccessTypeA, + GroupMode + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using 
AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + AccessTypeB, + GroupMode + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv2dProblemSize, + GroupMode + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dGroupFprop specialization for Optimized IteratorAlgorithm and multistage +/// pipeline that supports GroupMode::kSingleGroup. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dGroupFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + GroupMode::kSingleGroup, + IteratorAlgorithm::kOptimized, + StrideSupport, + AlignmentA, + AlignmentB +> { + + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA, + AccessTypeA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB 
= + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + AccessTypeB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = + ((sizeof_bits::value * AlignmentB) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + CacheOpB, + MmaPolicy, + Stages + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv2dProblemSize, + GroupMode::kSingleGroup + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dGroupFprop specialization for Optimized IteratorAlgorithm and +/// 2 stage pipeline that supports GroupMode::kSingleGroup. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dGroupFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + GroupMode::kSingleGroup, + IteratorAlgorithm::kOptimized, + StrideSupport, + AlignmentA, + AlignmentB +> { + + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + static_assert(platform::is_same::value, + "Current group conv only support NHWC layout"); + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA, + AccessTypeA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = 
cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB, + AccessTypeB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv2dProblemSize, + GroupMode::kSingleGroup + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad.h new file mode 100644 index 0000000000000000000000000000000000000000..d55d453eb02675d0b626865b6625dc4bf2b12e92 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad.h @@ -0,0 +1,1011 @@ 
+/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_tile_iterator.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kStrided, + /// Access granularity of A matrix in units of elements + int AlignmentA = 128 / cutlass::sizeof_bits::value, + /// Access granularity of B matrix in units of elements + int AlignmentB = 128 / cutlass::sizeof_bits::value +> struct DefaultConv2dWgrad; + +///////////////////////////////////////////////////////////////////////////////////////////////// + 
+///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassTensorOp convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad specialization for Analytic IteratorAlgorithm and multistage +// pipeline. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + AccessTypeA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = 
cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + AccessTypeB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad specialization for Analytic IteratorAlgorithm and two +// pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + AccessTypeA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + AccessTypeB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM 
components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad specialization for Optimized IteratorAlgorithm and multistage +// pipeline. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, 
ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + AccessTypeA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + AccessTypeB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad specialization for Optimized IteratorAlgorithm and two +// pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::AlignedArray; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + AccessTypeA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + AccessTypeB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM 
components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + kPartitionsK, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv2dWgrad specialization for Analytic IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AccessTypeA, + int AccessTypeB +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport, + AccessTypeA, + AccessTypeB +> { + + // Define the core components from GEMM + using MmaCore = typename 
cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad specialization for Optimized IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename 
ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AccessTypeA, + int AccessTypeB +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport, + AccessTypeA, + AccessTypeB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + 
arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad specialization for Analytic IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AccessTypeA, + int AccessTypeB +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport, + AccessTypeA, + AccessTypeB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + 
cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad specialization for Optimized IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AccessTypeA, + int AccessTypeB +> +struct DefaultConv2dWgrad < + ElementA, + LayoutA, + 
ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport, + AccessTypeA, + AccessTypeB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + 
ThreadblockSwizzle, + conv::Operator::kWgrad + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h new file mode 100644 index 0000000000000000000000000000000000000000..83b680ec3591de39470013d71b808f356306b2f0 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h @@ -0,0 +1,325 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_tile_iterator.h" +#include "cutlass/conv/threadblock/predicated_scale_bias_vector_iterator.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementScaleBias, + typename LayoutScaleBias, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kStrided +> struct DefaultConv2dWgradFusion; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassTensorOp convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad specialization for Analytic IteratorAlgorithm and multistage +// 
pipeline. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementScaleBias, + typename LayoutScaleBias, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dWgradFusion < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementScaleBias, + LayoutScaleBias, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + /// Define iterators over tiles from scale/bias vectors + using IteratorScaleBias = + cutlass::conv::threadblock::PredicatedScaleBiasVectorIterator< + cutlass::MatrixShape<1, WarpShape::kN>, + ElementScaleBias, + 
LayoutScaleBias>; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmWgradFusionMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + IteratorScaleBias, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionFusion< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv2dWgrad specialization for Optimized IteratorAlgorithm and multistage +// pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementScaleBias, + typename LayoutScaleBias, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv2dWgradFusion < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementScaleBias, + LayoutScaleBias, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dWgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dWgradActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + /// Define iterators over tiles from scale/bias vectors + using IteratorScaleBias = + cutlass::conv::threadblock::PredicatedScaleBiasVectorIterator< + cutlass::MatrixShape<1, WarpShape::kN>, + ElementScaleBias, + 
LayoutScaleBias>; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmWgradFusionMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + IteratorScaleBias, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionFusion< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv3d_dgrad.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv3d_dgrad.h new file mode 100644 index 0000000000000000000000000000000000000000..309924cebafe82df1651b0fb5542eb14dc6c5388 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv3d_dgrad.h @@ -0,0 +1,736 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h" + +#include "cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_tile_iterator.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv3dDgrad +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kStrided +> struct DefaultConv3dDgrad; + +/// Defines a kernel for Conv3dDgrad specialization for Analytic IteratorAlgorithm Dgrad Strided +// and multistage pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport::kStrided +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kStrided + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + 
arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad, + Conv3dProblemSize + >; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dDgrad specialization for Optimized IteratorAlgorithm Dgrad Strided +// and multistage pipeline. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorOptimized< + 
cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + + using IteratorB = + cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad, + Conv3dProblemSize + >; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + 
ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + conv::StrideSupport::kStrided +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kStrided + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad, + Conv3dProblemSize + >; + +}; + 
+///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dDgrad specialization for Optimized IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + // ThreadMapB, + // StrideSupport::kUnity + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // 
Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv3dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + conv::StrideSupport::kStrided +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + // 
cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kStrided + // > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + // cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + // > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dDgrad specialization for Optimized IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct 
DefaultConv3dDgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + // cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + // > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + // cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dDgradFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + // ThreadMapB, + // StrideSupport::kUnity + // > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define 
the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDgrad, + Conv3dProblemSize + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop.h new file mode 100644 index 0000000000000000000000000000000000000000..4b6709f08a4b2e93a0e3b93e1a343896368451c2 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop.h @@ -0,0 +1,981 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h" + + +#include "cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv3dFprop +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kUnity +> struct DefaultConv3dFprop; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dFprop specialization for Analytic Iterator Algorithm +/// and 2 stage pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + conv::StrideSupport StrideSupport +> +struct DefaultConv3dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + 
ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dFprop specialization for Analytic IteratorAlgorithm and multistage +// pipeline. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::StrideSupport StrideSupport +> +struct DefaultConv3dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + 
ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dFprop specialization for Optimized Iterator Algorithm +/// and 2 stage pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + conv::StrideSupport StrideSupport +> +struct DefaultConv3dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = 
threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dFprop specialization for Optimized IteratorAlgorithm and multistage +// pipeline. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::StrideSupport StrideSupport +> +struct DefaultConv3dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorOptimized< 
+ cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + + using IteratorB = + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport, + 5 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv3dFprop specialization for Analytic IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int 
Stages, + typename MathOperatorTag, + conv::StrideSupport StrideSupport +> +struct DefaultConv3dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport, + 5 + >::Epilogue; + + // 
Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dFprop specialization for Optimized IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::StrideSupport StrideSupport +> +struct DefaultConv3dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + 
cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport, + 5 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dFprop specialization for Analytic IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + conv::StrideSupport StrideSupport +> +struct DefaultConv3dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + StrideSupport +> { + + // Define the 
core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport, + 5 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dFprop specialization for Optimized IteratorAlgorithm, +/// 2 stage 
pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + conv::StrideSupport StrideSupport +> +struct DefaultConv3dFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using 
Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport, + 5 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_fusion.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_fusion.h new file mode 100644 index 0000000000000000000000000000000000000000..513de059c6591a47fbf2c75f81d1400c96fe9d48 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_fusion.h @@ -0,0 +1,360 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level fused activation's scale+bias+relu and implicit GEMM convolution + definitions that combine threadblock-scoped matrix multiply-add with the + appropriate threadblock-scoped epilogue. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h" +#include "cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h" +#include "cutlass/gemm/warp/scale_bias_tile_iterator.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for fused batch norm and Conv3dFprop +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementScaleBias, + typename LayoutScaleBias, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kUnity +> struct DefaultConv3dFpropFusion; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassTensorOp convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dFprop specialzation for Analytic IteratorAlgorithm and multistage +/// pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementScaleBias, + typename LayoutScaleBias, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dFpropFusion < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementScaleBias, + LayoutScaleBias, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + /// Define iterators over tiles from scale/bias vectors + using IteratorScaleBias = + cutlass::conv::threadblock::PredicatedScaleBiasVectorAccessIterator< + cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias, + LayoutScaleBias>; + + 
using SmemIteratorScaleBias = + cutlass::transform::threadblock::RegularScaleBiasVectorAccessIterator< + cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias, + LayoutScaleBias>; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + static int const kThreadCount = 32; + + // Warp-level iterators to load scale and bias vectors + using WarpIteratorScaleBias = cutlass::gemm::warp::ScaleBiasTileIterator< + MatrixShape, ElementScaleBias, + LayoutScaleBias, MatrixShape, + typename WarpMmaTensorOp::IteratorA::Base::Policy, kThreadCount, + MmaCore::WarpCount::kK>; + + // Define the Mma + using Mma = threadblock::ImplicitGemmFpropFusionMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + IteratorScaleBias, + SmemIteratorScaleBias, + arch::CacheOperation::Always, + MmaPolicy, + WarpIteratorScaleBias, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionFusion< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dFprop specialzation for Optimized IteratorAlgorithm and +/// multistage pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementScaleBias, + typename LayoutScaleBias, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dFpropFusion < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementScaleBias, + LayoutScaleBias, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassTensorOp, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, MathOperatorTag + >; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dFpropActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + LayoutA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + /// Define iterators over tiles from scale/bias vectors + using IteratorScaleBias = + cutlass::conv::threadblock::PredicatedScaleBiasVectorAccessIterator< + cutlass::MatrixShape<1, ThreadblockShape::kK>, 
ElementScaleBias, + LayoutScaleBias>; + + using SmemIteratorScaleBias = + cutlass::transform::threadblock::RegularScaleBiasVectorAccessIterator< + cutlass::MatrixShape<1, ThreadblockShape::kK>, ElementScaleBias, + LayoutScaleBias>; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + static int const kThreadCount = 32; + + // Warp-level iterators to load scale and bias vectors + using WarpIteratorScaleBias = cutlass::gemm::warp::ScaleBiasTileIterator< + MatrixShape, ElementScaleBias, + LayoutScaleBias, MatrixShape, + typename WarpMmaTensorOp::IteratorA::Base::Policy, kThreadCount, + MmaCore::WarpCount::kK>; + + // Define the Mma + using Mma = threadblock::ImplicitGemmFpropFusionMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Global, + IteratorScaleBias, + SmemIteratorScaleBias, + arch::CacheOperation::Always, + MmaPolicy, + WarpIteratorScaleBias, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionFusion< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_with_broadcast.h 
b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_with_broadcast.h new file mode 100644 index 0000000000000000000000000000000000000000..2fb12c2a502f9af2aa5383288e6695a108abdf60 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv3d_fprop_with_broadcast.h @@ -0,0 +1,222 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Defines a GEMM with Broadcast based on an existing UniversalGemm kernel. + +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_conv3d_fprop.h" +#include "cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h" + +#include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h" +#include "cutlass/epilogue/threadblock/epilogue_with_broadcast.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kUnity, + /// Access granularity of A matrix in units of elements + int AlignmentA = 128 / cutlass::sizeof_bits::value, + /// Access 
granularity of B matrix in units of elements + int AlignmentB = 128 / cutlass::sizeof_bits::value +> +struct DefaultConv3dFpropWithBroadcast { + + using ImplicitGemmBase = typename DefaultConv3dFprop< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + StrideSupport + >::Kernel; + + // Define epilogue + using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastTensorOp< + ArchTag, + typename ImplicitGemmBase::Epilogue::Shape, + typename ImplicitGemmBase::Epilogue::WarpMmaOperator, + ImplicitGemmBase::Epilogue::kPartitionsK, + ElementC, + typename EpilogueOutputOp::ElementT, + typename EpilogueOutputOp::ElementVector, + EpilogueOutputOp, + ImplicitGemmBase::Epilogue::kElementsPerAccess + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue< + typename ImplicitGemmBase::Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv3dFprop specialization for Analytic IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm, + conv::StrideSupport 
StrideSupport, + int AlignmentA, + int AlignmentB +> +struct DefaultConv3dFpropWithBroadcast < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + StrideSupport, + AlignmentA, + AlignmentB +> { + + using ImplicitGemmBase = typename DefaultConv3dFprop< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + StrideSupport + >::Kernel; + + // Define epilogue + using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimt< + ArchTag, + typename ImplicitGemmBase::Epilogue::Shape, + typename ImplicitGemmBase::Epilogue::WarpMmaOperator, + ElementC, + typename EpilogueOutputOp::ElementT, + typename EpilogueOutputOp::ElementVector, + EpilogueOutputOp, + ImplicitGemmBase::Epilogue::kElementsPerAccess, + layout::NoPermute, + StrideSupport, + 5 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue< + typename ImplicitGemmBase::Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv3d_wgrad.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv3d_wgrad.h new file mode 100644 index 
0000000000000000000000000000000000000000..6b50d2087e20889a934eaf34c7f120badff8a435 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_conv3d_wgrad.h @@ -0,0 +1,936 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dWgrad +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kStrided +> struct DefaultConv3dWgrad; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dWgrad specialization for Analytic IteratorAlgorithm and multistage +// pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + 
arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv3dWgrad specialization for Analytic IteratorAlgorithm and two +// pipeline. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv3dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename 
MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dWgrad specialization for Optimized IteratorAlgorithm and multistage +// pipeline. 
+template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + 
arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv3dWgrad specialization for Optimized IteratorAlgorithm and two +// pipeline. +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv3dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, OperatorClass, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = 
typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaTensorOp = typename MmaCore::MmaTensorOp; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename detail::DefaultConvEpilogue< + ArchTag, + ThreadblockShape, + WarpMmaTensorOp, + 1, + EpilogueOutputOp + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad, + Conv3dProblemSize + >; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Conv3dWgrad specialization for Analytic IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + 
ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad, + Conv3dProblemSize + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// 
Defines a kernel for Conv3dWgrad specialization for Optimized IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultConv3dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = 
threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad, + Conv3dProblemSize + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dWgrad specialization for Analytic IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv3dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + 
cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Conv3dWgrad specialization for Optimized IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultConv3dWgrad < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + 
ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::ColumnMajor, + ElementB, layout::RowMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dWgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dWgradActivationTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + ThreadMapB + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kWgrad, + Conv3dProblemSize + >; + +}; 
+///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_deconv2d.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_deconv2d.h new file mode 100644 index 0000000000000000000000000000000000000000..a58046ffa414e6556d14b20c5402fb5d82cfbf64 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_deconv2d.h @@ -0,0 +1,999 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv2d_tile_iterator.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Deconv2d +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename 
ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kStrided, + /// Access granularity of A matrix in units of elements + int AlignmentA = 128 / cutlass::sizeof_bits::value, + /// Access granularity of B matrix in units of elements + int AlignmentB = 128 / cutlass::sizeof_bits::value +> struct DefaultDeconv2d; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Deconv2d specialization for Analytic IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultDeconv2d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + conv::StrideSupport::kUnity, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename 
MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kUnity + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + cutlass::AlignedArray, + conv::GroupMode::kNone, + true /*IsDeconv*/ + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport::kStrided, + 4 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> 
+struct DefaultDeconv2d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + conv::StrideSupport::kStrided, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kStrided + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + cutlass::AlignedArray, + conv::GroupMode::kNone, + true /*IsDeconv*/ + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + 
EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Deconv2d specialization for Optimized IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultDeconv2d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename 
MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + cutlass::AlignedArray, + true /*IsDeconv*/ + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport::kStrided, + 4 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultDeconv2d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + conv::StrideSupport::kStrided, + AlignmentA, + AlignmentB +> { + + // Define the 
core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kStrided + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + cutlass::AlignedArray, + true /*IsDeconv*/ + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv + >; + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Deconv2d specialization for Analytic IteratorAlgorithm, +/// 2 stage 
pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultDeconv2d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + conv::StrideSupport::kUnity, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kUnity + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + cutlass::AlignedArray, + conv::GroupMode::kNone, + true /*IsDeconv*/ + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + 
using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport::kStrided, + 4 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv + >; + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultDeconv2d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + conv::StrideSupport::kStrided, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + 
using IteratorA = + cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kStrided + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + cutlass::AlignedArray, + conv::GroupMode::kNone, + true /*IsDeconv*/ + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Deconv2d specialization for Optimized IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + 
typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultDeconv2d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + cutlass::AlignedArray, + true /*IsDeconv*/ + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename 
epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport::kStrided, + 4 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + int AlignmentA, + int AlignmentB +> +struct DefaultDeconv2d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + conv::StrideSupport::kStrided, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv2dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kStrided + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from 
the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + cutlass::AlignedArray, + true /*IsDeconv*/ + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimtStridedDgrad< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionStridedDgrad< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv + >; + +}; + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_deconv2d_with_broadcast.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_deconv2d_with_broadcast.h new file mode 100644 index 0000000000000000000000000000000000000000..e62187e3680e55a71d77bf4fee19276357753f98 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_deconv2d_with_broadcast.h @@ -0,0 +1,305 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Defines a GEMM with Broadcast based on an existing UniversalGemm kernel. 
+ +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_deconv2d.h" +#include "cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h" + +#include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h" +#include "cutlass/epilogue/threadblock/epilogue_with_broadcast.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kStrided, + /// Access granularity of A matrix in units of elements + int AlignmentA = 128 / cutlass::sizeof_bits::value, + /// Access granularity of B matrix in units of elements + int AlignmentB = 128 / cutlass::sizeof_bits::value +> +struct DefaultDeconv2dWithBroadcast { + + using ImplicitGemmBase = typename DefaultDeconv2d< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + StrideSupport, + AlignmentA, + AlignmentB + >::Kernel; + + // Define epilogue + using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastTensorOp< + ArchTag, + typename ImplicitGemmBase::Epilogue::Shape, + typename 
ImplicitGemmBase::Epilogue::WarpMmaOperator, + ImplicitGemmBase::Epilogue::kPartitionsK, + ElementC, + typename EpilogueOutputOp::ElementT, + typename EpilogueOutputOp::ElementVector, + EpilogueOutputOp, + ImplicitGemmBase::Epilogue::kElementsPerAccess + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue< + typename ImplicitGemmBase::Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Deconv2d specialization, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm, + int AlignmentA, + int AlignmentB +> +struct DefaultDeconv2dWithBroadcast < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + conv::StrideSupport::kUnity, + AlignmentA, + AlignmentB +> { + + using ImplicitGemmBase = typename DefaultDeconv2d< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + conv::StrideSupport::kUnity, + 
AlignmentA, + AlignmentB + >::Kernel; + + // Define epilogue + using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimt< + ArchTag, + typename ImplicitGemmBase::Epilogue::Shape, + typename ImplicitGemmBase::Epilogue::WarpMmaOperator, + ElementC, + typename EpilogueOutputOp::ElementT, + typename EpilogueOutputOp::ElementVector, + EpilogueOutputOp, + ImplicitGemmBase::Epilogue::kElementsPerAccess + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue< + typename ImplicitGemmBase::Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm, + int AlignmentA, + int AlignmentB +> +struct DefaultDeconv2dWithBroadcast < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + conv::StrideSupport::kStrided, + AlignmentA, + AlignmentB +> { + + using ImplicitGemmBase = typename DefaultDeconv2d< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + conv::StrideSupport::kStrided, + AlignmentA, + AlignmentB + >::Kernel; + + // Define epilogue 
+ using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimtStridedDgrad< + ArchTag, + typename ImplicitGemmBase::Epilogue::Shape, + typename ImplicitGemmBase::Epilogue::WarpMmaOperator, + ElementC, + typename EpilogueOutputOp::ElementT, + typename EpilogueOutputOp::ElementVector, + EpilogueOutputOp, + ImplicitGemmBase::Epilogue::kElementsPerAccess + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue< + typename ImplicitGemmBase::Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_deconv3d.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_deconv3d.h new file mode 100644 index 0000000000000000000000000000000000000000..cb7ca07e6eb9b18f3006d51e742772f755852e23 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_deconv3d.h @@ -0,0 +1,541 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" + +#include "cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h" +#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h" + +#include "cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_tile_iterator.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Deconv3d +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kStrided +> struct DefaultDeconv3d; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> 
+struct DefaultDeconv3d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + conv::StrideSupport::kStrided +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kStrided + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + true /*IsDeconv*/ + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport::kStrided, + 5 + >::Epilogue; + 
+ // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv, + Conv3dProblemSize + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Deconv3d specialization for Optimized IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag +> +struct DefaultDeconv3d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + 
cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB, + true /*IsDeconv*/ + // ThreadMapB, + // StrideSupport::kUnity + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmMultistage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + arch::CacheOperation::Always, + IteratorB, + SmemIteratorB, + arch::CacheOperation::Always, + MmaPolicy, + Stages + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport::kStrided, + 5 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultDeconv3d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kAnalytic, + conv::StrideSupport::kStrided +> { + + // Define the core components from GEMM + using MmaCore = typename 
cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + // cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + conv::StrideSupport::kStrided + // > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + // cutlass::conv::threadblock::TileIteratorStridedDgrad< + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, + ThreadMapB, + true /*IsDeconv*/ + // > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport::kStrided, + 5 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Defines a kernel for Deconv3d specialization 
for Optimized IteratorAlgorithm, +/// 2 stage pipeline, and FFMA-based mainloop for SM50 +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag +> +struct DefaultDeconv3d < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport::kUnity +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, layout::RowMajor, + ElementB, layout::ColumnMajor, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + 2, MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + // cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dDgradOutputGradientTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementA, + ThreadMapA, + StrideSupport::kUnity + // > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using IteratorB = + // cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv3dFpropFilterTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, + LayoutB, + ThreadMapB, + true /*IsDeconv*/ + // ThreadMapB, + // StrideSupport::kUnity + // > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = 
typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::ImplicitGemmPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + false, + layout::NoPermute, + StrideSupport::kStrided, + 5 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv, + Conv3dProblemSize + >; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_deconv3d_with_broadcast.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_deconv3d_with_broadcast.h new file mode 100644 index 0000000000000000000000000000000000000000..e25c8b2eee551252b902e0c0845416b753194df1 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_deconv3d_with_broadcast.h @@ -0,0 +1,309 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Defines a GEMM with Broadcast based on an existing UniversalGemm kernel. 
+ +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/conv/kernel/default_deconv3d.h" +#include "cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h" + +#include "cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h" +#include "cutlass/epilogue/threadblock/epilogue_with_broadcast.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kOptimized, + conv::StrideSupport StrideSupport = StrideSupport::kStrided, + /// Access granularity of A matrix in units of elements + int AlignmentA = 128 / cutlass::sizeof_bits::value, + /// Access granularity of B matrix in units of elements + int AlignmentB = 128 / cutlass::sizeof_bits::value +> +struct DefaultDeconv3dWithBroadcast { + + using ImplicitGemmBase = typename DefaultDeconv3d< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + StrideSupport + >::Kernel; + + // Define epilogue + using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastTensorOp< + ArchTag, + typename ImplicitGemmBase::Epilogue::Shape, + typename ImplicitGemmBase::Epilogue::WarpMmaOperator, + 
ImplicitGemmBase::Epilogue::kPartitionsK, + ElementC, + typename EpilogueOutputOp::ElementT, + typename EpilogueOutputOp::ElementVector, + EpilogueOutputOp, + ImplicitGemmBase::Epilogue::kElementsPerAccess + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue< + typename ImplicitGemmBase::Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Deconv3d specialization for Analytic IteratorAlgorithm, +/// multi-stage pipeline, and FFMA-based mainloop for SM80 + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm, + int AlignmentA, + int AlignmentB +> +struct DefaultDeconv3dWithBroadcast < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + conv::StrideSupport::kUnity, + AlignmentA, + AlignmentB +> { + + using ImplicitGemmBase = typename DefaultDeconv3d< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + conv::StrideSupport::kUnity + 
>::Kernel; + + // Define epilogue + using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimt< + ArchTag, + typename ImplicitGemmBase::Epilogue::Shape, + typename ImplicitGemmBase::Epilogue::WarpMmaOperator, + ElementC, + typename EpilogueOutputOp::ElementT, + typename EpilogueOutputOp::ElementVector, + EpilogueOutputOp, + ImplicitGemmBase::Epilogue::kElementsPerAccess, + layout::NoPermute, + StrideSupport::kStrided, + 5 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue< + typename ImplicitGemmBase::Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm, + int AlignmentA, + int AlignmentB +> +struct DefaultDeconv3dWithBroadcast < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + conv::StrideSupport::kStrided, + AlignmentA, + AlignmentB +> { + + using ImplicitGemmBase = typename DefaultDeconv3d< + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm, + conv::StrideSupport::kStrided + >::Kernel; + + 
// Define epilogue + using Epilogue = typename cutlass::conv::kernel::detail::DefaultConvEpilogueWithBroadcastSimt< + ArchTag, + typename ImplicitGemmBase::Epilogue::Shape, + typename ImplicitGemmBase::Epilogue::WarpMmaOperator, + ElementC, + typename EpilogueOutputOp::ElementT, + typename EpilogueOutputOp::ElementVector, + EpilogueOutputOp, + ImplicitGemmBase::Epilogue::kElementsPerAccess, + layout::NoPermute, + StrideSupport::kStrided, + 5 + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolutionWithFusedEpilogue< + typename ImplicitGemmBase::Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kDeconv, + Conv3dProblemSize + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_depthwise_fprop.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_depthwise_fprop.h new file mode 100644 index 0000000000000000000000000000000000000000..ba70813e4c94104522a05897a60811d26ae3c6a4 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/default_depthwise_fprop.h @@ -0,0 +1,588 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief + Default kernel-level Depthwise implicit GEMM convolution definitions combine threadblock-scoped + matrix multiply-add with the appropriate threadblock-scoped epilogue. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/conv/kernel/default_conv2d.h" +#include "cutlass/conv/kernel/direct_convolution.h" + +#include "cutlass/conv/threadblock/depthwise_mma_core_with_lane_access_size.h" + +#include "cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h" +#include "cutlass/conv/threadblock/depthwise_fprop_pipelined.h" + +// Direct Conv Related Header files +#include "cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_optimized.h" +#include "cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_fixed_stride_dilation.h" + +#include "cutlass/conv/threadblock/depthwise_fprop_filter_tile_access_iterator_direct_conv_optimized.h" +#include "cutlass/conv/threadblock/depthwise_fprop_direct_conv_multistage.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for DepthwiseFprop +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic, + conv::StrideSupport StrideSupport = StrideSupport::kUnity, + /// Access granularity of A matrix in units of elements + int AlignmentA = 128 / cutlass::sizeof_bits::value, + /// Access granularity of B matrix in units of elements + int AlignmentB = cutlass::sizeof_bits::value / 
cutlass::sizeof_bits::value +> struct DefaultDepthwiseFprop; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for DepthwiseFprop with direct convolution algorithm +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename OperatorClass, + typename ArchTag, + typename ThreadblockShape, + typename ThreadBlockOutputShape, + typename FilterShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic, + conv::StrideSupport StrideSupport = StrideSupport::kUnity, + // MatrixShape + typename StrideShape = cutlass::MatrixShape<-1, -1>, + // MatrixShape< Height, Width> + typename DilationShape = cutlass::MatrixShape<-1, -1>, + /// Access granularity of A matrix in units of elements + int AlignmentA = 128 / cutlass::sizeof_bits::value, + /// Access granularity of B matrix in units of elements + int AlignmentB = 128 / cutlass::sizeof_bits::value +> struct DefaultDepthwiseDirect2dConvFprop; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// OpClassSimt convolutions +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Depthwise specialization for Analytic IteratorAlgorithm +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + int AlignmentA, + int AlignmentB +> 
+struct DefaultDepthwiseFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + 2, + MathOperatorTag, // cutlass::arch::OpMultiplyAdd + IteratorAlgorithm::kAnalytic, + StrideSupport, + AlignmentA, + AlignmentB +> { + + // Define the core components from GEMM + using MmaCore = typename cutlass::conv::threadblock::DepthwiseMmaCoreWithLaneAccessSize< + ThreadblockShape, + WarpShape, + InstructionShape, + ElementA, + layout::RowMajor, + ElementB, + layout::ColumnMajor, + ElementAccumulator, + layout::RowMajor, + arch::OpClassSimt, + 128, + sizeof_bits::value, + 2, + MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropActivationTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementA, LayoutA, + ThreadMapA + > + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::TileIterator< + cutlass::conv::threadblock::Conv2dFpropFilterTileAccessIteratorAnalytic< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB, + AccessTypeB, + cutlass::conv::GroupMode::kDepthwise + > + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + + // Define the Mma + using Mma = threadblock::DepthwiseFpropPipelined< + ThreadblockShape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + MmaPolicy + >; + + // Define the epilogue + using Epilogue = 
typename epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + // Define the kernel + using Kernel = cutlass::conv::kernel::ImplicitGemmConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv2dProblemSize, + cutlass::conv::GroupMode::kDepthwise + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Depthwise specialization for direct 2d conv implementation, +/// multiple stage pipeline, and SIMT-based mainloop +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename ThreadBlockOutputShape, + typename FilterShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + typename StrideShape, + typename DilationShape, + int AlignmentA, + int AlignmentB +> +struct DefaultDepthwiseDirect2dConvFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, + ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + ThreadBlockOutputShape, + FilterShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kOptimized, + StrideSupport, + StrideShape, + DilationShape, + AlignmentA, + AlignmentB +> { + // One warp handles the entrie groups per cta. 
+ static_assert(ThreadblockShape::kN == WarpShape::kN, + "ThreadblockShape::kN should be same as WarpShape::kN "); + static_assert(ThreadblockShape::kK == FilterShape::kCount && WarpShape::kK == FilterShape::kCount, + "ThreadblockShape::kK and WarpShape::kK should be same as filter size"); + static_assert(ThreadblockShape::kM % WarpShape::kM == 0, + "ThreadblockShape::kM must be divisible by WarpShape shape::kM"); + static_assert(ThreadBlockOutputShape::kN, "ThreadBlockOutputShape::kN should be 1"); + + // Define the core components from GEMM + using MmaCore = typename cutlass::conv::threadblock::DepthwiseDirectConvMmaCoreWithLaneAccessSize< + ThreadblockShape, + ThreadBlockOutputShape, + FilterShape, + WarpShape, + InstructionShape, + ElementA, + layout::RowMajor, + ElementB, + layout::ColumnMajor, + ElementAccumulator, + layout::RowMajor, + arch::OpClassSimt, + 128, + 128, + Stages, + MathOperatorTag>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized< + cutlass::MatrixShape, // < outputShape:KMNK, groups per cta> + ThreadBlockOutputShape, + ElementA, LayoutA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + using ThreadOutputShape = typename MmaCore::ThreadOutputShape; + static cutlass::arch::CacheOperation::Kind const CacheOpA = + 
((sizeof_bits::value * AlignmentA) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = + ((sizeof_bits::value * AlignmentB) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultDirectConvEpilogueSimt< + ThreadblockShape, // < outputShape:KMNK, groups per cta> + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + ThreadOutputShape, + ThreadBlockOutputShape + >::Epilogue; + + // Define the Mma + using Mma = threadblock::DepthwiseFpropDirectConvMultipleStage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + CacheOpA, + IteratorB, + SmemIteratorB, + CacheOpB, + MmaPolicy, + Stages, + Epilogue + >; + + // Define the kernel + using Kernel = cutlass::conv::kernel::DirectConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv2dProblemSize, + cutlass::conv::GroupMode::kDepthwise, + ThreadBlockOutputShape + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines a kernel for Depthwise specialization for direct 2d conv implementation, +/// multiple stage pipeline, and SIMT-based mainloop +template < + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ElementAccumulator, + typename ArchTag, + typename ThreadblockShape, + typename ThreadBlockOutputShape, + typename FilterShape, + typename WarpShape, + typename InstructionShape, + typename EpilogueOutputOp, + typename ThreadblockSwizzle, + int Stages, + typename MathOperatorTag, + conv::StrideSupport StrideSupport, + typename StrideShape, + typename DilationShape, + int AlignmentA, + int AlignmentB +> +struct DefaultDepthwiseDirect2dConvFprop < + ElementA, + LayoutA, + ElementB, + LayoutB, + ElementC, + LayoutC, 
+ ElementAccumulator, + arch::OpClassSimt, + ArchTag, + ThreadblockShape, + ThreadBlockOutputShape, + FilterShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + MathOperatorTag, + IteratorAlgorithm::kFixedStrideDilation, + StrideSupport, + StrideShape, + DilationShape, + AlignmentA, + AlignmentB +> { + + + + // One warp handles the entrie groups per cta. + static_assert(ThreadblockShape::kN == WarpShape::kN, + "ThreadblockShape::kN should be same as WarpShape::kN "); + static_assert(ThreadblockShape::kK == FilterShape::kCount && WarpShape::kK == FilterShape::kCount, + "ThreadblockShape::kK and WarpShape::kK should be same as filter size"); + static_assert(ThreadblockShape::kM % WarpShape::kM == 0, + "ThreadblockShape::kM must be divisible by WarpShape shape::kM"); + static_assert(ThreadBlockOutputShape::kN, "ThreadBlockOutputShape::kN should be 1"); + + static_assert(StrideShape::kRow >= 0 && StrideShape::kColumn >= 0, "Stride should be fixed"); + static_assert(DilationShape::kRow >= 0 && DilationShape::kColumn >= 0, "Stride should be fixed"); + + // Activations loaded by threadblock + static int const ActivationShapeH = (ThreadBlockOutputShape::kH - 1) * StrideShape::kRow + + (FilterShape::kRow - 1) * DilationShape::kRow + 1; + + static int const ActivationShapeW = (ThreadBlockOutputShape::kW - 1) * StrideShape::kColumn + + (FilterShape::kColumn - 1) * DilationShape::kColumn + 1; + + using ActivationShape = + cutlass::conv::TensorNHWCShape<1, ActivationShapeH, ActivationShapeW, ThreadblockShape::kN >; + + // Define the core components from GEMM + using MmaCore = typename cutlass::conv::threadblock::DepthwiseDirectConvMmaCoreWithLaneAccessSize< + ThreadblockShape, + ThreadBlockOutputShape, + FilterShape, + WarpShape, + InstructionShape, + ElementA, + layout::RowMajor, + ElementB, + layout::ColumnMajor, + ElementAccumulator, + layout::RowMajor, + arch::OpClassSimt, + 128, + 128, + Stages, + MathOperatorTag, + 
IteratorAlgorithm::kFixedStrideDilation, + StrideShape, + DilationShape, + ActivationShape>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using IteratorA = + cutlass::conv::threadblock::DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation< + cutlass::MatrixShape, // < outputShape:KMNK, groups per cta> + ThreadBlockOutputShape, + StrideShape, + DilationShape, + ActivationShape, + ElementA, LayoutA, + ThreadMapA + >; + + using SmemIteratorA = typename MmaCore::SmemIteratorA; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::AlignedArray; + using IteratorB = + cutlass::conv::threadblock::DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized< + cutlass::MatrixShape, + ElementB, LayoutB, + ThreadMapB + >; + + using SmemIteratorB = typename MmaCore::SmemIteratorB; + + // Warp-level GEMM components + using WarpMmaSimtOp = typename MmaCore::MmaWarpSimt; + using MmaPolicy = typename MmaCore::MmaPolicy; + using ThreadOutputShape = typename MmaCore::ThreadOutputShape; + static cutlass::arch::CacheOperation::Kind const CacheOpA = + ((sizeof_bits::value * AlignmentA) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = + ((sizeof_bits::value * AlignmentB) == 128) + ? 
cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + // Define the epilogue + using Epilogue = typename epilogue::threadblock::DefaultDirectConvEpilogueSimt< + ThreadblockShape, // < outputShape:KMNK, groups per cta> + WarpMmaSimtOp, + EpilogueOutputOp, + EpilogueOutputOp::kCount, + ThreadOutputShape, + ThreadBlockOutputShape + >::Epilogue; + + // Define the Mma + using Mma = threadblock::DepthwiseFpropDirectConvMultipleStage< + ThreadblockShape, + IteratorA, + SmemIteratorA, + CacheOpA, + IteratorB, + SmemIteratorB, + CacheOpB, + MmaPolicy, + Stages, + Epilogue, + IteratorAlgorithm::kFixedStrideDilation + >; + + // Define the kernel + using Kernel = cutlass::conv::kernel::DirectConvolution< + Mma, + Epilogue, + ThreadblockSwizzle, + conv::Operator::kFprop, + Conv2dProblemSize, + cutlass::conv::GroupMode::kDepthwise, + ThreadBlockOutputShape + >; +}; + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/direct_convolution.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/direct_convolution.h new file mode 100644 index 0000000000000000000000000000000000000000..8c04988790b9b03e41e9c2245dbdf2e5e8af493b --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/direct_convolution.h @@ -0,0 +1,506 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a multi-staged Depthwise Convolution kernel. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/aligned_buffer.h" +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/platform/platform.h" +#include "cutlass/semaphore.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/epilogue/threadblock/output_iterator_parameter.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters structure +template > ///! OutputShape per ThreadBlock +struct DirectConvolutionParams { + using Mma = Mma_; + using Epilogue = Epilogue_; + using EpilogueOutputOp = typename Epilogue::OutputOp; + using ThreadblockSwizzle = ThreadblockSwizzle_; + using ThreadBlockOutputShape = ThreadBlockOutputShape_; + static Operator const kConvolutionalOperator = ConvOperator; + using ConvProblemSize = ConvProblemSize_; + using Arguments = Arguments_; + using ConvOutputIteratorParameter = ConvOutputIteratorParameter_; + + using ThreadblockShape = typename Mma::Shape; + static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm; + static conv::GroupMode const kGroupMode = GroupMode_; + static int const kStages = Mma::kStages; + + ConvProblemSize problem_size; + cutlass::gemm::GemmCoord grid_tiled_shape; + gemm::GemmCoord implicit_gemm_problem_size; + int swizzle_log_tile; + int smem_size_; + + int gemm_k_iterations; + int gemm_k_iterations_per_channel; + typename Mma::IteratorA::Params iterator_A; + typename Mma::IteratorA::Element const *ptr_A; + typename Mma::IteratorB::Params iterator_B; + typename 
Mma::IteratorB::Element const *ptr_B; + typename Mma::IteratorB::Element *ptr_reordered_B; + typename Epilogue::OutputTileIterator::Params iterator_C; + typename Epilogue::OutputTileIterator::Element *ptr_C; + typename Epilogue::OutputTileIterator::Params iterator_D; + typename Epilogue::OutputTileIterator::Element *ptr_D; + typename EpilogueOutputOp::Params output_op; + int *semaphore; + SplitKMode split_k_mode; + int split_k_slices; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + DirectConvolutionParams() : swizzle_log_tile(0), gemm_k_iterations(0) {} + + /// + CUTLASS_HOST_DEVICE + DirectConvolutionParams(Arguments const &args, int *semaphore = nullptr) + : problem_size(args.problem_size), + implicit_gemm_problem_size( + cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size)), + iterator_A(Mma::IteratorA::getParams(args.problem_size, args.ref_A.layout())), + ptr_A(args.ref_A.data()), + iterator_B(Mma::IteratorB::getParams(args.problem_size, args.ref_B.layout())), + ptr_B(args.ref_B.data()), + ptr_reordered_B(args.ref_reordered_B.data()), + iterator_C(ConvOutputIteratorParameter::layout(args.ref_C), args.problem_size), + ptr_C(args.ref_C.data()), + iterator_D(ConvOutputIteratorParameter::layout(args.ref_D), args.problem_size), + ptr_D(args.ref_D.data()), + output_op(args.output_op), + semaphore(semaphore), + split_k_mode(args.split_k_mode), + split_k_slices(args.problem_size.split_k_slices) { + gemm_k_iterations = + depthwise_gemm_k_iterations(kConvolutionalOperator, + ThreadblockShape::kK, + args.problem_size, + kIteratorAlgorithm, + kGroupMode, + ThreadblockShape::kN); + + gemm_k_iterations_per_channel = implicit_gemm_k_iterations_per_channel( + kConvolutionalOperator, args.problem_size, kIteratorAlgorithm); + + ThreadblockSwizzle threadblock_swizzle; + + grid_tiled_shape = threadblock_swizzle.get_tiled_shape( + kConvolutionalOperator, + problem_size, + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + 
args.problem_size.split_k_slices); + + swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape); + + // Dynamic SMEM usage because stride and dilation are runtime params. + smem_size_ = (cutlass::platform::max(iterator_A.activation_size, int(sizeof(typename Epilogue::SharedStorage))) * kStages + iterator_B.filter_size); + } + + CUTLASS_HOST_DEVICE + int get_smem_size() { + // Dynamic Smem Size + return smem_size_; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +template +struct ReorderKernel { + using Params = Params_; + using ElementB = ElementB_; + + union SharedStorage {}; + + static unsigned int const kReorderKernelThreadPerCTA = 128; + + CUTLASS_HOST_DEVICE + ReorderKernel() {} + + CUTLASS_HOST_DEVICE + static dim3 get_grid_shape(Params const ¶ms) { + return dim3{static_cast( + (params.problem_size.filter_size() + kReorderKernelThreadPerCTA - 1) / + kReorderKernelThreadPerCTA), + 1, + 1}; + } + + CUTLASS_HOST_DEVICE + static dim3 get_block_shape() { return dim3{kReorderKernelThreadPerCTA, 1, 1}; } + + CUTLASS_HOST_DEVICE + void operator()(Params const ¶ms, SharedStorage &shared_storage) { + int64_t m = static_cast(params.problem_size.groups); + int64_t n = static_cast(params.problem_size.filter_size() / params.problem_size.K); + const ElementB *src_with_type = static_cast(params.ptr_B); + ElementB *dst_with_type = static_cast(params.ptr_reordered_B); + + int64_t linear_index = blockIdx.x * kReorderKernelThreadPerCTA + threadIdx.x; + int64_t index_m = linear_index / n; + int64_t index_n = linear_index % n; + int64_t new_linear_index = index_m + index_n * m; + + if (linear_index < m * n) { + dst_with_type[new_linear_index] = src_with_type[linear_index]; + } + return; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Mma_, ///! Threadblock-scoped matrix multiply-accumulate + typename Epilogue_, ///! 
Epilogue + typename ThreadblockSwizzle_, ///! Threadblock swizzling function + conv::Operator ConvOperator, ///! Convolutional operator (Fprop, Dgrad, Wgrad) + typename ConvProblemSize_ = Conv2dProblemSize, ///! Convolutional operator on 2D or 3D problem + conv::GroupMode GroupMode_ = conv::GroupMode::kNone, ///! Group mode + typename ThreadBlockOutputShape_ = cutlass::conv::TensorNHWCShape<1, 1, 1, 1> +> +struct DirectConvolution { + + using Mma = Mma_; + using Epilogue = Epilogue_; + using EpilogueOutputOp = typename Epilogue::OutputOp; + using ThreadblockSwizzle = ThreadblockSwizzle_; + using ThreadBlockOutputShape = ThreadBlockOutputShape_; + static Operator const kConvolutionalOperator = ConvOperator; + + using ElementA = typename Mma::IteratorA::Element; + using LayoutA = typename Mma::IteratorA::Layout; + using ElementB = typename Mma::IteratorB::Element; + using LayoutB = typename Mma::IteratorB::Layout; + using ElementC = typename EpilogueOutputOp::ElementOutput; + + /// Set output tensor C layout + using LayoutC = LayoutA; + + using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator; + using ElementCompute = typename EpilogueOutputOp::ElementCompute; + + using WarpMmaOperator = typename Mma::Policy::Operator; + + using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator; + using MathOperator = typename ArchMmaOperator::Operator; + + using OperatorClass = typename WarpMmaOperator::OperatorClass; + using ArchTag = typename WarpMmaOperator::ArchTag; + + using ThreadblockShape = typename Mma::Shape; + using WarpShape = typename WarpMmaOperator::Shape; + using InstructionShape = typename cutlass::gemm::GemmShape<1, 1, 1>; + + static int const kStages = Mma::kStages; + static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm; + static StrideSupport const kStrideSupport = Mma::IteratorA::kStrideSupport; + + /// Warp count (concept: GemmShape) + using WarpCount = typename Mma::WarpCount; + static int const 
kThreadCount = 32 * WarpCount::kCount; + + using TensorRefA = typename Mma::IteratorA::TensorRef; + using TensorRefB = typename Mma::IteratorB::TensorRef; + using TensorRefC = cutlass::TensorRef; + + /// Check iterator A and B convolution dimension are the same and + // set device::ImplicitGemmConvolution::kConvDim + static_assert(Mma::IteratorA::kConvDim == Mma::IteratorB::kConvDim, + "Convolution on different different dimensions is not supported"); + static int const kConvDim = Mma::IteratorA::kConvDim; + + /// Conv dimension and problem size structure (Conv2d or Conv3d) + using ConvProblemSize = ConvProblemSize_; + + static conv::GroupMode const kGroupMode = GroupMode_; + + + // + // + // + using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter< + LayoutC, + typename Epilogue::OutputTileIterator::Layout, + TensorRefC, + ConvOperator, + ConvProblemSize + >; + + + /// Argument structure + struct Arguments { + + // + // Data members + // + + ConvProblemSize problem_size; + TensorRefA ref_A; + TensorRefB ref_B; + TensorRefB ref_reordered_B; + TensorRefC ref_C; + TensorRefC ref_D; + typename EpilogueOutputOp::Params output_op; + SplitKMode split_k_mode; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + Arguments() { } + + CUTLASS_HOST_DEVICE + Arguments( + ConvProblemSize const & problem_size + ): + problem_size(problem_size) { } + + CUTLASS_HOST_DEVICE + Arguments( + ConvProblemSize const & problem_size, + TensorRefA const & ref_A, + TensorRefB const & ref_B, + TensorRefC const & ref_C, + TensorRefC const & ref_D, + typename EpilogueOutputOp::Params const & output_op, + TensorRefB const & ref_reordered_B = nullptr, + SplitKMode const & split_k_mode = SplitKMode::kSerial + ): + problem_size(problem_size), + ref_A(ref_A), + ref_B(ref_B), + ref_C(ref_C), + ref_D(ref_D), + output_op(output_op), + ref_reordered_B(ref_reordered_B), + split_k_mode(split_k_mode) + { + + } + + }; + + using Params = + typename 
cutlass::conv::kernel::DirectConvolutionParams; + + using ReorderKernel = typename cutlass::conv::kernel::ReorderKernel; + + /// Shared memory storage structure + union SharedStorage { + typename Mma::SharedStorage main_loop; + typename Epilogue::SharedStorage epilogue; + }; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + DirectConvolution() { } + + /// Executes one ImplicitGEMM + CUTLASS_DEVICE + void operator()(Params const ¶ms, SharedStorage &shared_storage) { + + // Compute threadblock location + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord threadblock_tile_idx = + threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // Early exit if threadblock is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() || + params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) { + + return; + } + + // Compute position within threadblock + int thread_idx = threadIdx.x; + int iterator_column_offset = 0; + int filter_row_offset = 0; + if (kGroupMode != GroupMode::kNone) { + if (kGroupMode == GroupMode::kDepthwise) { + iterator_column_offset += threadblock_tile_idx.n() * Mma::Shape::kN; + } + } + + // Construct iterators to A and B operands + typename Mma::IteratorA iterator_A( + params.iterator_A, + params.problem_size, + params.ptr_A, + thread_idx, + MatrixCoord( + threadblock_tile_idx.m() + threadblock_tile_idx.k(), + iterator_column_offset + ) + ); + + typename Mma::IteratorB iterator_B( + params.iterator_B, + params.problem_size, + params.ptr_reordered_B, + thread_idx, + MatrixCoord( + filter_row_offset, + iterator_column_offset + ) + ); + + // Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. 
+ int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + int lane_idx = threadIdx.x % 32; + + // + // Main loop + // + + // Construct thread-scoped matrix multiply + Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); + + typename Mma::FragmentC accumulators; + + accumulators.clear(); + + // + // Epilogue + // + + EpilogueOutputOp output_op(params.output_op); + + // Compute logical position within grid + threadblock_tile_idx = + threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + + MatrixCoord threadblock_offset( + threadblock_tile_idx.m() + threadblock_tile_idx.k(), + threadblock_tile_idx.n() * Mma::Shape::kN + ); + + // Tile iterator writing to destination tensor + typename Epilogue::OutputTileIterator iterator_D( + params.iterator_D, + params.ptr_D, + ConvOutputIteratorParameter::extent(params.problem_size), + thread_idx, + threadblock_offset + ); + + // Tile iterator reading from source accumulator tensor + typename Epilogue::OutputTileIterator iterator_C( + params.iterator_C, + params.ptr_C, + ConvOutputIteratorParameter::extent(params.problem_size), + thread_idx, + threadblock_offset + ); + + + // Construct the epilogue + Epilogue epilogue( + shared_storage.epilogue, + thread_idx, + warp_idx, + lane_idx); + + + // Compute threadblock-scoped matrix multiply-add + // Epilogue is fused in the mainloop + mma(params.gemm_k_iterations, + accumulators, + iterator_A, + params.iterator_A, + iterator_B, + params.iterator_B, + accumulators, + epilogue, + output_op, + iterator_D, + iterator_C, + params.split_k_slices); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution.h 
b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution.h new file mode 100644 index 0000000000000000000000000000000000000000..d3fa0e907bb94c2716861395324b2da0346cebde --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution.h @@ -0,0 +1,455 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a pipelined Implicit GEMM kernel. +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/aligned_buffer.h" +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/semaphore.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/epilogue/threadblock/output_iterator_parameter.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Mma_, ///! Threadblock-scoped matrix multiply-accumulate + typename Epilogue_, ///! Epilogue + typename ThreadblockSwizzle_, ///! Threadblock swizzling function + conv::Operator ConvOperator, ///! Convolutional operator (Fprop, Dgrad, Wgrad, Deconv) + typename ConvProblemSize_ = Conv2dProblemSize, ///! Convolutional operator on 2D or 3D problem + conv::GroupMode GroupMode_ = conv::GroupMode::kNone ///! 
Group mode +> +struct ImplicitGemmConvolution { + + using Mma = Mma_; + using Epilogue = Epilogue_; + using EpilogueOutputOp = typename Epilogue::OutputOp; + using ThreadblockSwizzle = ThreadblockSwizzle_; + static Operator const kConvolutionalOperator = ConvOperator; + + using ElementA = typename Mma::IteratorA::Element; + using LayoutA = typename Mma::IteratorA::Layout; + using ElementB = typename Mma::IteratorB::Element; + using LayoutB = typename Mma::IteratorB::Layout; + using ElementC = typename EpilogueOutputOp::ElementOutput; + + /// Set output tensor C layout + using LayoutC = LayoutA; + + using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator; + using ElementCompute = typename EpilogueOutputOp::ElementCompute; + + using WarpMmaOperator = typename Mma::Policy::Operator; + + using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator; + using MathOperator = typename ArchMmaOperator::Operator; + + using OperatorClass = typename WarpMmaOperator::OperatorClass; + using ArchTag = typename WarpMmaOperator::ArchTag; + + using ThreadblockShape = typename Mma::Shape; + using WarpShape = typename WarpMmaOperator::Shape; + using InstructionShape = typename ArchMmaOperator::Shape; + + static int const kStages = Mma::kStages; + static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm; + static StrideSupport const kStrideSupport = Mma::IteratorA::kStrideSupport; + + /// Warp count (concept: GemmShape) + using WarpCount = typename Mma::WarpCount; + static int const kThreadCount = 32 * WarpCount::kCount; + + using TensorRefA = typename Mma::IteratorA::TensorRef; + using TensorRefB = typename Mma::IteratorB::TensorRef; + using TensorRefC = cutlass::TensorRef; + + /// Check iterator A and B convolution dimension are the same and + // set device::ImplicitGemmConvolution::kConvDim + static_assert(Mma::IteratorA::kConvDim == Mma::IteratorB::kConvDim, + "Convolution on different different dimensions is not supported"); + 
static int const kConvDim = Mma::IteratorA::kConvDim; + + /// Conv dimension and problem size structure (Conv2d or Conv3d) + using ConvProblemSize = ConvProblemSize_; + + static conv::GroupMode const kGroupMode = GroupMode_; + + /// Wgrad C stride idx for implicit gemm algorithm + // Conv2d row-major matrix C (KxRSC) + // Conv3d row-major matrix C (KxTRSC) + static int const kWgradCStrideIdx = + platform::is_same::value ? 2 : 3; + + /// This chooses the appropriate stride element of the C tensor. + static int const kTensorCStrideIdx = + (kConvolutionalOperator == conv::Operator::kWgrad ? kWgradCStrideIdx : 0); + + // + // + // + using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter< + LayoutC, + typename Epilogue::OutputTileIterator::Layout, + TensorRefC, + ConvOperator, + ConvProblemSize + >; + + /// Argument structure + struct Arguments { + + // + // Data members + // + + ConvProblemSize problem_size; + TensorRefA ref_A; + TensorRefB ref_B; + TensorRefC ref_C; + TensorRefC ref_D; + typename EpilogueOutputOp::Params output_op; + SplitKMode split_k_mode; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + Arguments() { } + + CUTLASS_HOST_DEVICE + Arguments( + ConvProblemSize const & problem_size + ): + problem_size(problem_size) { } + + CUTLASS_HOST_DEVICE + Arguments( + ConvProblemSize const & problem_size, + TensorRefA const & ref_A, + TensorRefB const & ref_B, + TensorRefC const & ref_C, + TensorRefC const & ref_D, + typename EpilogueOutputOp::Params const & output_op, + SplitKMode const & split_k_mode = SplitKMode::kSerial + ): + problem_size(problem_size), + ref_A(ref_A), + ref_B(ref_B), + ref_C(ref_C), + ref_D(ref_D), + output_op(output_op), + split_k_mode(split_k_mode) + { + + } + + }; + + /// Parameters structure + struct Params { + ConvProblemSize problem_size; + cutlass::gemm::GemmCoord grid_tiled_shape; + gemm::GemmCoord implicit_gemm_problem_size; + int swizzle_log_tile; + + int gemm_k_iterations; + int 
gemm_k_iterations_per_channel; + typename Mma::IteratorA::Params iterator_A; + typename Mma::IteratorA::Element const *ptr_A; + typename Mma::IteratorB::Params iterator_B; + typename Mma::IteratorB::Element const *ptr_B; + typename Epilogue::OutputTileIterator::Params iterator_C; + typename Epilogue::OutputTileIterator::Element *ptr_C; + typename Epilogue::OutputTileIterator::Params iterator_D; + typename Epilogue::OutputTileIterator::Element *ptr_D; + typename EpilogueOutputOp::Params output_op; + int *semaphore; + SplitKMode split_k_mode; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params(): swizzle_log_tile(0), gemm_k_iterations(0) { } + + /// + CUTLASS_HOST_DEVICE + Params( + Arguments const &args, + int *semaphore = nullptr + ): + problem_size(args.problem_size), + implicit_gemm_problem_size(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size)), + iterator_A(Mma::IteratorA::getParams(args.problem_size, args.ref_A.layout())), + ptr_A(args.ref_A.data()), + iterator_B(args.problem_size, args.ref_B.layout()), + ptr_B(args.ref_B.data()), + iterator_C(ConvOutputIteratorParameter::layout(args.ref_C), implicit_gemm_tensor_c_extent(kConvolutionalOperator, args.problem_size)), + ptr_C(args.ref_C.data()), + iterator_D(ConvOutputIteratorParameter::layout(args.ref_D), implicit_gemm_tensor_c_extent(kConvolutionalOperator, args.problem_size)), + ptr_D(args.ref_D.data()), + output_op(args.output_op), + semaphore(semaphore), + split_k_mode(args.split_k_mode) + { + gemm_k_iterations = implicit_gemm_k_iterations( + kConvolutionalOperator, + ThreadblockShape::kK, + args.problem_size, + kIteratorAlgorithm, + kGroupMode, + ThreadblockShape::kN); + + gemm_k_iterations_per_channel = implicit_gemm_k_iterations_per_channel( + kConvolutionalOperator, args.problem_size, kIteratorAlgorithm); + + ThreadblockSwizzle threadblock_swizzle; + + grid_tiled_shape = threadblock_swizzle.get_tiled_shape( + implicit_gemm_problem_size, + {ThreadblockShape::kM, 
ThreadblockShape::kN, ThreadblockShape::kK}, + args.problem_size.split_k_slices); + + swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape); + } + }; + + /// Shared memory storage structure + union SharedStorage { + typename Mma::SharedStorage main_loop; + typename Epilogue::SharedStorage epilogue; + }; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + ImplicitGemmConvolution() { } + + /// Executes one ImplicitGEMM + CUTLASS_DEVICE + void operator()(Params const ¶ms, SharedStorage &shared_storage) { + + // Compute threadblock location + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord threadblock_tile_idx = + threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // Early exit if CTA is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() || + params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) { + + return; + } + + // Compute position within threadblock + int thread_idx = threadIdx.x; + int iterator_A_column_offset = threadblock_tile_idx.k() * Mma::Shape::kK; + if (kGroupMode != GroupMode::kNone) { + if (kGroupMode != GroupMode::kDepthwise) { + int k_per_group = params.problem_size.K / params.problem_size.groups; + int group_idx = threadblock_tile_idx.n() * Mma::Shape::kN / k_per_group; + int channels_per_group = params.problem_size.C / params.problem_size.groups; + iterator_A_column_offset += group_idx * channels_per_group; + } else { + iterator_A_column_offset += threadblock_tile_idx.n() * Mma::Shape::kN; + } + } + + // Construct iterators to A and B operands + typename Mma::IteratorA iterator_A( + params.iterator_A, + params.problem_size, + params.ptr_A, + thread_idx, + MatrixCoord( + threadblock_tile_idx.m() * Mma::Shape::kM, + iterator_A_column_offset + ) + ); + + typename Mma::IteratorB iterator_B( + params.iterator_B, + params.problem_size, + params.ptr_B, + thread_idx, + MatrixCoord( + threadblock_tile_idx.k() * Mma::Shape::kK, + threadblock_tile_idx.n() * Mma::Shape::kN + ) + ); + 
+ // Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. + int warp_idx = canonical_warp_idx_sync(); + int lane_idx = threadIdx.x % 32; + + // + // Main loop + // + + // Construct thread-scoped matrix multiply + Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); + + typename Mma::FragmentC accumulators; + + accumulators.clear(); + + // Compute threadblock-scoped matrix multiply-add + mma(params.gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators, params.gemm_k_iterations_per_channel); + + // + // Epilogue + // + + EpilogueOutputOp output_op(params.output_op); + + // Construct the semaphore. + int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m(); + + Semaphore semaphore(params.semaphore + block_idx, thread_idx); + + // Compute logical position within grid + threadblock_tile_idx = + threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // If performing a reduction via split-K, fetch the initial synchronization + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + // Fetch the synchronization lock initially but do not block. 
+ semaphore.fetch(); + + // Indicate which position in a serial reduction the output operator is currently updating + output_op.set_k_partition(threadblock_tile_idx.k(), params.grid_tiled_shape.k()); + } + + MatrixCoord threadblock_offset( + threadblock_tile_idx.m() * Mma::Shape::kM, + threadblock_tile_idx.n() * Mma::Shape::kN + ); + + // Tile iterator writing to destination tensor + typename Epilogue::OutputTileIterator iterator_D( + params.iterator_D, + params.ptr_D, + ConvOutputIteratorParameter::extent(params.problem_size), + thread_idx, + threadblock_offset + ); + + // Tile iterator reading from source accumulator tensor + typename Epilogue::OutputTileIterator iterator_C( + params.iterator_C, + params.ptr_C, + ConvOutputIteratorParameter::extent(params.problem_size), + thread_idx, + threadblock_offset + ); + + // Construct the epilogue + Epilogue epilogue( + shared_storage.epilogue, + thread_idx, + warp_idx, + lane_idx); + + // Wait on the semaphore - this latency may have been covered by iterator construction + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + // For subsequent threadblocks, the source matrix is held in the 'D' tensor. + if (threadblock_tile_idx.k()) { + iterator_C = iterator_D; + } + + semaphore.wait(threadblock_tile_idx.k()); + + } + // Each split-k-slice writes to a unique tensor location + else if (params.split_k_mode == SplitKMode::kParallel) { + iterator_D.add_pointer_offset(threadblock_tile_idx.k() * + cutlass::conv::implicit_gemm_tensor_c_size(ConvOperator, params.problem_size)); + } + + // Run efficient epilogue + epilogue(output_op, iterator_D, accumulators, iterator_C); + + // + // Release the semaphore + // + + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + int lock = 0; + if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) { + + // The final threadblock resets the semaphore for subsequent grids. 
+ lock = 0; + } + else { + // Otherwise, the semaphore is incremented + lock = threadblock_tile_idx.k() + 1; + } + + semaphore.release(lock); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h new file mode 100644 index 0000000000000000000000000000000000000000..5451c176f4027bc40a3ec3466efe69dea18f5342 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h @@ -0,0 +1,461 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a pipelined fused activation's scale+bias+relu and Implicit GEMM kernel. +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/aligned_buffer.h" +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/semaphore.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/epilogue/threadblock/output_iterator_parameter.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Mma_, ///! Threadblock-scoped matrix multiply-accumulate + typename Epilogue_, ///! Epilogue + typename ThreadblockSwizzle_, ///! 
Threadblock swizzling function + conv::Operator ConvOperator, ///! Convolutional operator (Fprop, Dgrad, Wgrad) + typename ConvProblemSize_ = Conv2dProblemSize ///! Convolutional operator on 2D or 3D problem +> +struct ImplicitGemmConvolutionFusion { + + using Mma = Mma_; + using Epilogue = Epilogue_; + using EpilogueOutputOp = typename Epilogue::OutputOp; + using ThreadblockSwizzle = ThreadblockSwizzle_; + static Operator const kConvolutionalOperator = ConvOperator; + + using ElementA = typename Mma::IteratorA::Element; + using LayoutA = typename Mma::IteratorA::Layout; + using ElementB = typename Mma::IteratorB::Element; + using LayoutB = typename Mma::IteratorB::Layout; + + using ElementScaleBias = typename Mma::IteratorScaleBias::Element; + using LayoutScaleBias = typename Mma::IteratorScaleBias::Layout; + + using ElementC = typename EpilogueOutputOp::ElementOutput; + using LayoutC = LayoutA; + + using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator; + using ElementCompute = typename EpilogueOutputOp::ElementCompute; + + using WarpMmaOperator = typename Mma::Policy::Operator; + + using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator; + using MathOperator = typename ArchMmaOperator::Operator; + + using OperatorClass = typename WarpMmaOperator::OperatorClass; + using ArchTag = typename WarpMmaOperator::ArchTag; + + using ThreadblockShape = typename Mma::Shape; + using WarpShape = typename WarpMmaOperator::Shape; + using InstructionShape = typename ArchMmaOperator::Shape; + + static int const kStages = Mma::kStages; + static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm; + + /// Warp count (concept: GemmShape) + using WarpCount = typename Mma::WarpCount; + static int const kThreadCount = 32 * WarpCount::kCount; + + using TensorRefA = typename Mma::IteratorA::TensorRef; + using TensorRefB = typename Mma::IteratorB::TensorRef; + using TensorRefScaleBias = typename Mma::IteratorScaleBias::TensorRef; + 
using TensorRefC = cutlass::TensorRef; + + /// Check iterator A and B convolution dimension are the same and + // set device::ImplicitGemmConvolution::kConvDim + static_assert(Mma::IteratorA::kConvDim == Mma::IteratorB::kConvDim, + "Convolution on different different dimensions is not supported"); + static int const kConvDim = Mma::IteratorA::kConvDim; + + /// Conv dimension and problem size structure (Conv2d or Conv3d) + using ConvProblemSize = ConvProblemSize_; + + static conv::GroupMode const kGroupMode = conv::GroupMode::kNone; + + /// Wgrad C stride idx for implicit gemm algorithm + // Conv2d row-major matrix C (KxRSC) + // Conv3d row-major matrix C (KxTRSC) + static int const kWgradCStrideIdx = + platform::is_same::value ? 2 : 3; + + /// This chooses the appropriate stride element of the C tensor. + static int const kTensorCStrideIdx = + (kConvolutionalOperator == conv::Operator::kWgrad ? kWgradCStrideIdx : 0); + + // + // + // + using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter< + LayoutC, + typename Epilogue::OutputTileIterator::Layout, + TensorRefC, + ConvOperator, + ConvProblemSize + >; + + /// Argument structure + struct Arguments { + + // + // Data members + // + + ConvProblemSize problem_size; + TensorRefA ref_A; + TensorRefB ref_B; + TensorRefScaleBias ref_scale; + TensorRefScaleBias ref_bias; + TensorRefC ref_C; + TensorRefC ref_D; + typename EpilogueOutputOp::Params output_op; + SplitKMode split_k_mode; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + Arguments() { } + + CUTLASS_HOST_DEVICE + Arguments( + ConvProblemSize const & problem_size + ): + problem_size(problem_size) { } + + CUTLASS_HOST_DEVICE + Arguments( + ConvProblemSize const & problem_size, + TensorRefA const & ref_A, + TensorRefB const & ref_B, + TensorRefScaleBias const & ref_scale, + TensorRefScaleBias const & ref_bias, + TensorRefC const & ref_C, + TensorRefC const & ref_D, + typename EpilogueOutputOp::Params const & 
output_op, + SplitKMode const & split_k_mode = SplitKMode::kSerial + ): + problem_size(problem_size), + ref_A(ref_A), + ref_B(ref_B), + ref_scale(ref_scale), + ref_bias(ref_bias), + ref_C(ref_C), + ref_D(ref_D), + output_op(output_op), + split_k_mode(split_k_mode) + { + + } + + }; + + /// Parameters structure + struct Params { + ConvProblemSize problem_size{}; + cutlass::gemm::GemmCoord grid_tiled_shape{}; + gemm::GemmCoord implicit_gemm_problem_size{}; + int swizzle_log_tile{0}; + int gemm_k_iterations{0}; + typename Mma::IteratorA::Params iterator_A{}; + typename Mma::IteratorA::Element const *ptr_A = nullptr; + typename Mma::IteratorB::Params iterator_B{}; + typename Mma::IteratorB::Element const *ptr_B = nullptr; + typename Mma::IteratorScaleBias::Params iterator_scale_bias{}; + typename Mma::IteratorScaleBias::Element const *ptr_scale = nullptr; + typename Mma::IteratorScaleBias::Element const *ptr_bias = nullptr; + typename Epilogue::OutputTileIterator::Params iterator_C {}; + typename Epilogue::OutputTileIterator::Element *ptr_C = nullptr; + typename Epilogue::OutputTileIterator::Params iterator_D {}; + typename Epilogue::OutputTileIterator::Element *ptr_D = nullptr; + typename EpilogueOutputOp::Params output_op {}; + int *semaphore = nullptr; + SplitKMode split_k_mode {}; + + // + // Methods + // + Params() = default; + + /// + CUTLASS_HOST_DEVICE + Params( + Arguments const &args, + int *semaphore = nullptr + ): + problem_size(args.problem_size), + implicit_gemm_problem_size(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size)), + iterator_A(Mma::IteratorA::getParams(args.problem_size, args.ref_A.layout())), + ptr_A(args.ref_A.data()), + iterator_B(args.problem_size, args.ref_B.layout()), + ptr_B(args.ref_B.data()), + iterator_scale_bias(args.problem_size, args.ref_scale.layout()), + ptr_scale(args.ref_scale.data()), + ptr_bias(args.ref_bias.data()), + iterator_C(ConvOutputIteratorParameter::layout(args.ref_C)), + 
ptr_C(args.ref_C.data()), + iterator_D(ConvOutputIteratorParameter::layout(args.ref_D)), + ptr_D(args.ref_D.data()), + output_op(args.output_op), + semaphore(semaphore), + split_k_mode(args.split_k_mode) + { + gemm_k_iterations = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape::kK, args.problem_size); + + ThreadblockSwizzle threadblock_swizzle; + + grid_tiled_shape = threadblock_swizzle.get_tiled_shape( + implicit_gemm_problem_size, + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.problem_size.split_k_slices); + + swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape); + } + }; + + /// Shared memory storage structure + union SharedStorage { + typename Mma::SharedStorage main_loop; + typename Epilogue::SharedStorage epilogue; + }; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + ImplicitGemmConvolutionFusion() { } + + /// Executes one ImplicitGEMM + CUTLASS_DEVICE + void operator()(Params const ¶ms, SharedStorage &shared_storage) { + + // Compute threadblock location + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord threadblock_tile_idx = + threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // Early exit if CTA is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() || + params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) { + + return; + } + + // Compute position within threadblock + int thread_idx = threadIdx.x; + + // Construct iterators to A operand + typename Mma::IteratorA iterator_A( + params.iterator_A, + params.problem_size, + params.ptr_A, + thread_idx, + MatrixCoord( + threadblock_tile_idx.m() * Mma::Shape::kM, + threadblock_tile_idx.k() * Mma::Shape::kK + ) + ); + + // Construct iterators to B operand + typename Mma::IteratorB iterator_B( + params.iterator_B, + params.problem_size, + params.ptr_B, + thread_idx, + MatrixCoord( + threadblock_tile_idx.k() * Mma::Shape::kK, + threadblock_tile_idx.n() * Mma::Shape::kN + ) + ); + + // 
Construct iterators to A scale/bias vector + typename Mma::IteratorScaleBias iterator_scale_bias( + params.iterator_scale_bias, + params.problem_size, + params.ptr_scale, + params.ptr_bias, + thread_idx, + MatrixCoord( + 0, (kConvolutionalOperator == conv::Operator::kFprop) ? + (threadblock_tile_idx.k() * Mma::Shape::kK) : + // Wgrad + (threadblock_tile_idx.n() * Mma::Shape::kN) + ) + ); + + // Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. + int warp_idx = canonical_warp_idx_sync(); + int lane_idx = threadIdx.x % 32; + + // + // Main loop + // + + // Construct thread-scoped matrix multiply + Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); + + typename Mma::FragmentC accumulators; + + accumulators.clear(); + + // Compute threadblock-scoped matrix multiply-add + mma(params.gemm_k_iterations, accumulators, iterator_A, + iterator_B, iterator_scale_bias, accumulators); + + // + // Epilogue + // + + EpilogueOutputOp output_op(params.output_op); + + // Construct the semaphore. + int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m(); + + Semaphore semaphore(params.semaphore + block_idx, thread_idx); + + // Compute logical position within grid + threadblock_tile_idx = + threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // If performing a reduction via split-K, fetch the initial synchronization + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + // Fetch the synchronization lock initially but do not block. 
+ semaphore.fetch(); + + // Indicate which position in a serial reduction the output operator is currently updating + output_op.set_k_partition(threadblock_tile_idx.k(), params.grid_tiled_shape.k()); + } + + MatrixCoord threadblock_offset( + threadblock_tile_idx.m() * Mma::Shape::kM, + threadblock_tile_idx.n() * Mma::Shape::kN + ); + + // Tile iterator writing to destination tensor + typename Epilogue::OutputTileIterator iterator_D( + params.iterator_D, + params.ptr_D, + ConvOutputIteratorParameter::extent(params.problem_size), + thread_idx, + threadblock_offset + ); + + // Tile iterator reading from source accumulator tensor + typename Epilogue::OutputTileIterator iterator_C( + params.iterator_C, + params.ptr_C, + ConvOutputIteratorParameter::extent(params.problem_size), + thread_idx, + threadblock_offset + ); + + // Construct the epilogue + Epilogue epilogue( + shared_storage.epilogue, + thread_idx, + warp_idx, + lane_idx); + + // Wait on the semaphore - this latency may have been covered by iterator construction + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + // For subsequent threadblocks, the source matrix is held in the 'D' tensor. + if (threadblock_tile_idx.k()) { + iterator_C = iterator_D; + } + + semaphore.wait(threadblock_tile_idx.k()); + + } + // Each split-k-slice writes to a unique tensor location + else if (params.split_k_mode == SplitKMode::kParallel) { + iterator_D.add_pointer_offset(threadblock_tile_idx.k() * + cutlass::conv::implicit_gemm_tensor_c_size(ConvOperator, params.problem_size)); + } + + // Run efficient epilogue + epilogue(output_op, iterator_D, accumulators, iterator_C); + + // + // Release the semaphore + // + + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + int lock = 0; + if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) { + + // The final threadblock resets the semaphore for subsequent grids. 
+ lock = 0; + } + else { + // Otherwise, the semaphore is incremented + lock = threadblock_tile_idx.k() + 1; + } + + semaphore.release(lock); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h new file mode 100644 index 0000000000000000000000000000000000000000..071854cd629e26417ca987bc24681665c8d30702 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h @@ -0,0 +1,492 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a pipelined Implicit GEMM kernel. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/aligned_buffer.h" +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/semaphore.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/epilogue/threadblock/output_iterator_parameter.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Mma_, ///! Threadblock-scoped matrix multiply-accumulate + typename Epilogue_, ///! Epilogue + typename ThreadblockSwizzle_, ///! 
Threadblock swizzling function + conv::Operator ConvOperator, ///! Convolutional operator (Fprop, Dgrad, Wgrad) + typename ConvProblemSize_ = Conv2dProblemSize ///! Convolutional operator on 2D or 3D problem +> +struct ImplicitGemmConvolutionStridedDgrad { + + using Mma = Mma_; + using Epilogue = Epilogue_; + using EpilogueOutputOp = typename Epilogue::OutputOp; + using ThreadblockSwizzle = ThreadblockSwizzle_; + static Operator const kConvolutionalOperator = ConvOperator; + + using ElementA = typename Mma::IteratorA::Element; + using LayoutA = typename Mma::IteratorA::Layout; + using ElementB = typename Mma::IteratorB::Element; + using LayoutB = typename Mma::IteratorB::Layout; + using ElementC = typename EpilogueOutputOp::ElementOutput; + + /// Set output tensor C layout + using LayoutC = LayoutA; + + using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator; + using ElementCompute = typename EpilogueOutputOp::ElementCompute; + + using WarpMmaOperator = typename Mma::Policy::Operator; + + using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator; + using MathOperator = typename ArchMmaOperator::Operator; + + using OperatorClass = typename WarpMmaOperator::OperatorClass; + using ArchTag = typename WarpMmaOperator::ArchTag; + + using ThreadblockShape = typename Mma::Shape; + using WarpShape = typename WarpMmaOperator::Shape; + using InstructionShape = typename ArchMmaOperator::Shape; + + static int const kStages = Mma::kStages; + static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm; + static StrideSupport const kStrideSupport = Mma::IteratorA::kStrideSupport; + + /// Warp count (concept: GemmShape) + using WarpCount = typename Mma::WarpCount; + static int const kThreadCount = 32 * WarpCount::kCount; + + using TensorRefA = typename Mma::IteratorA::TensorRef; + using TensorRefB = typename Mma::IteratorB::TensorRef; + using TensorRefC = cutlass::TensorRef; + + /// Check iterator A and B convolution dimension are 
the same and + // set device::ImplicitGemmConvolution::kConvDim + static_assert(Mma::IteratorA::kConvDim == Mma::IteratorB::kConvDim, + "Convolution on different different dimensions is not supported"); + static int const kConvDim = Mma::IteratorA::kConvDim; + + /// Conv dimension and problem size structure (Conv2d or Conv3d) + using ConvProblemSize = ConvProblemSize_; + + static conv::GroupMode const kGroupMode = conv::GroupMode::kNone; + + /// Wgrad C stride idx for implicit gemm algorithm + // Conv2d row-major matrix C (KxRSC) + // Conv3d row-major matrix C (KxTRSC) + static int const kWgradCStrideIdx = + platform::is_same::value ? 2 : 3; + + /// This chooses the appropriate stride element of the C tensor. + static int const kTensorCStrideIdx = + (kConvolutionalOperator == conv::Operator::kWgrad ? kWgradCStrideIdx : 0); + + // Strided dgrad uses a specialized threadblock swizzle for functionality and performance + static_assert((platform::is_same::value) || + (platform::is_same>::value) || + (platform::is_same>::value) || + (platform::is_same>::value), + "Needs ThreadblockSwizzle type specialized for strided dgrad"); + + // + // + // + using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter< + LayoutC, + typename Epilogue::OutputTileIterator::Layout, + TensorRefC, + ConvOperator, + ConvProblemSize + >; + + /// Argument structure + struct Arguments { + + // + // Data members + // + + ConvProblemSize problem_size{}; + TensorRefA ref_A{}; + TensorRefB ref_B{}; + TensorRefC ref_C{}; + TensorRefC ref_D{}; + typename EpilogueOutputOp::Params output_op{}; + SplitKMode split_k_mode{}; + + // + // Methods + // + + /// Default ctor + Arguments() = default; + + CUTLASS_HOST_DEVICE + Arguments( + ConvProblemSize const & problem_size + ): + problem_size(problem_size) { } + + CUTLASS_HOST_DEVICE + Arguments( + ConvProblemSize const & problem_size, + TensorRefA const & ref_A, + TensorRefB const & ref_B, + TensorRefC const & ref_C, + TensorRefC 
const & ref_D, + typename EpilogueOutputOp::Params const & output_op, + SplitKMode const & split_k_mode = SplitKMode::kSerial + ): + problem_size(problem_size), + ref_A(ref_A), + ref_B(ref_B), + ref_C(ref_C), + ref_D(ref_D), + output_op(output_op), + split_k_mode(split_k_mode) + { + + } + + }; + + /// Parameters structure + struct Params { + ConvProblemSize problem_size{}; + cutlass::gemm::GemmCoord grid_tiled_shape{}; + int swizzle_log_tile{0}; + FastDivmod stride_h_divmod{}; + FastDivmod stride_w_divmod{}; + int gemm_k_iterations{0}; + typename Mma::IteratorA::Params iterator_A{}; + typename Mma::IteratorA::Element const *ptr_A = nullptr; + typename Mma::IteratorB::Params iterator_B{}; + typename Mma::IteratorB::Element const *ptr_B = nullptr; + typename Epilogue::OutputTileIterator::Params iterator_C{}; + typename Epilogue::OutputTileIterator::Element *ptr_C = nullptr; + typename Epilogue::OutputTileIterator::Params iterator_D{}; + typename Epilogue::OutputTileIterator::Element *ptr_D = nullptr; + typename EpilogueOutputOp::Params output_op {}; + int *semaphore = nullptr; + SplitKMode split_k_mode {}; + + // + // Methods + // + Params() = default; + + /// + CUTLASS_HOST_DEVICE + Params( + Arguments const &args, + int *semaphore = nullptr + ): + problem_size(args.problem_size), + stride_h_divmod(args.problem_size.stride_h), + stride_w_divmod(args.problem_size.stride_w), + iterator_A(Mma::IteratorA::getParams(args.problem_size, args.ref_A.layout())), + ptr_A(args.ref_A.data()), + iterator_B(args.problem_size, args.ref_B.layout()), + ptr_B(args.ref_B.data()), + iterator_C(ConvOutputIteratorParameter::layout(args.ref_C), args.problem_size, ThreadblockShape::kM), + ptr_C(args.ref_C.data()), + iterator_D(ConvOutputIteratorParameter::layout(args.ref_D), args.problem_size, ThreadblockShape::kM), + ptr_D(args.ref_D.data()), + output_op(args.output_op), + semaphore(semaphore), + split_k_mode(args.split_k_mode) + { + gemm_k_iterations = 
implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape::kK, args.problem_size); + + ThreadblockSwizzle threadblock_swizzle; + + grid_tiled_shape = threadblock_swizzle.get_tiled_shape( + kConvolutionalOperator, + args.problem_size, + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.problem_size.split_k_slices); + + swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape); + } + }; + + /// Shared memory storage structure + union SharedStorage { + typename Mma::SharedStorage main_loop; + typename Epilogue::SharedStorage epilogue; + }; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + ImplicitGemmConvolutionStridedDgrad() { } + + /// Executes one ImplicitGEMM + CUTLASS_DEVICE + void operator()(Params const ¶ms, SharedStorage &shared_storage) { + + // Compute threadblock location + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord threadblock_tile_idx = + threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // Early exit if CTA is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() || + params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) { + + return; + } + + // Compute position within threadblock + int thread_idx = threadIdx.x; + + // Compute starting filter position for strided dgrad + int tile_m_per_filter = strided_dgrad_tile_m_per_filter(params.problem_size, + ThreadblockShape::kM); + int filter_tile_m = (threadblock_tile_idx.m() / tile_m_per_filter); + + + // The subsequent fast_divmod() operations are equivalent to the following logical computation: + // + // int start_r = filter_tile_m / (params.problem_size.stride_w); + // int start_s = filter_tile_m % (params.problem_size.stride_w); + + int start_r, start_s; + params.stride_w_divmod(start_r, start_s, filter_tile_m); + + int filter_r = start_r; + int filter_s = start_s; + + if (params.problem_size.mode == Mode::kConvolution) { + filter_r = (params.problem_size.R - 1 - filter_r); + filter_s = 
(params.problem_size.S - 1 - filter_s); + } + + // Starting h, w positions for filter position in gemm_k=0 + int start_h, start_w; + strided_dgrad_starting_coords( + params.problem_size, + params.stride_h_divmod, params.stride_w_divmod, + filter_r, filter_s, + start_h, start_w); + + if (start_h >= params.problem_size.H || start_w >= params.problem_size.W) { + return; + } + + typename Mma::FragmentC accumulators; + + accumulators.clear(); + + // Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. + int warp_idx = canonical_warp_idx_sync(); + int lane_idx = threadIdx.x % 32; + + // Check if CTA contributes valid MMA (Dy * w) and accumulator will be non-zero after MMA + if (start_r < params.problem_size.R && start_s < params.problem_size.S) { + // Scale gemm_k_iterations for strided dgrad + int gemm_k_iterations = (params.gemm_k_iterations / (params.problem_size.R * params.problem_size.S) + ) * params.problem_size.num_gemm_k_filter_positions(start_r, start_s); + + // Construct iterators to A and B operands + typename Mma::IteratorA iterator_A( + params.iterator_A, + params.problem_size, + params.ptr_A, + thread_idx, + params.stride_h_divmod, params.stride_w_divmod, + start_r, start_s, + MatrixCoord( + threadblock_tile_idx.m() * Mma::Shape::kM, + threadblock_tile_idx.k() * Mma::Shape::kK + ) + ); + + typename Mma::IteratorB iterator_B( + params.iterator_B, + params.problem_size, + params.ptr_B, + thread_idx, + start_r, start_s, + MatrixCoord( + threadblock_tile_idx.k() * Mma::Shape::kK, + threadblock_tile_idx.n() * Mma::Shape::kN + ) + ); + + // + // Main loop + // + + // Construct thread-scoped matrix multiply + Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); + + // Compute threadblock-scoped matrix multiply-add + mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators); + } + + // + // Epilogue + // + + EpilogueOutputOp output_op(params.output_op); + + // Construct the semaphore. 
+ int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m(); + Semaphore semaphore(params.semaphore + block_idx, thread_idx); + + // Compute logical position within grid + threadblock_tile_idx = + threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // If performing a reduction via split-K, fetch the initial synchronization + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + // Fetch the synchronization lock initially but do not block. + semaphore.fetch(); + + // Indicate which position in a serial reduction the output operator is currently updating + output_op.set_k_partition(threadblock_tile_idx.k(), params.grid_tiled_shape.k()); + } + + MatrixCoord threadblock_offset( + threadblock_tile_idx.m() * Mma::Shape::kM, + threadblock_tile_idx.n() * Mma::Shape::kN + ); + + // Tile iterator writing to destination tensor + typename Epilogue::OutputTileIterator iterator_D( + params.iterator_D, + params.ptr_D, + ConvOutputIteratorParameter::extent(params.problem_size), + thread_idx, + params.stride_h_divmod, params.stride_w_divmod, + start_r, start_s, + threadblock_offset + ); + + // Construct the epilogue + Epilogue epilogue( + shared_storage.epilogue, + thread_idx, + warp_idx, + lane_idx); + + if (output_op.is_source_needed()) + { + // Tile iterator reading from source accumulator tensor + typename Epilogue::OutputTileIterator iterator_C( + params.iterator_C, + params.ptr_C, + ConvOutputIteratorParameter::extent(params.problem_size), + thread_idx, + params.stride_h_divmod, params.stride_w_divmod, + start_r, start_s, + threadblock_offset); + + // Wait on the semaphore - this latency may have been covered by iterator construction + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + // For subsequent threadblocks, the source matrix is held in the 'D' tensor. 
+ if (threadblock_tile_idx.k()) { + iterator_C = iterator_D; + } + + semaphore.wait(threadblock_tile_idx.k()); + } + + // Run epilogue with addend source iterator + epilogue(output_op, iterator_D, accumulators, iterator_C); + } + else + { + // Run epilogue without addend source iterator + epilogue(output_op, iterator_D, accumulators); + } + + // + // Release the semaphore + // + + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + int lock = 0; + if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) { + + // The final threadblock resets the semaphore for subsequent grids. + lock = 0; + } + else { + // Otherwise, the semaphore is incremented + lock = threadblock_tile_idx.k() + 1; + } + + semaphore.release(lock); + } + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_absmax.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_absmax.h new file mode 100644 index 0000000000000000000000000000000000000000..0113473f9b28d7c657c07ff8f85e34fc66ea1ed1 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_absmax.h @@ -0,0 +1,494 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Convolution kernel with an epilogue that computes the absolute maximum value of the output + and a pre-activation-function auxiliary output. The auxiliary output is also (optionally) + stored to global memory. 
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/semaphore.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/conv3d_problem_size.h"
+#include "cutlass/epilogue/threadblock/output_iterator_parameter.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace kernel {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// NOTE(review): throughout this vendored copy, angle-bracket template argument lists appear to
+// have been stripped by the extraction tool (e.g. `cutlass::TensorRef;`, `platform::is_same::value`,
+// `static_cast(...)`). Verify each such site against upstream CUTLASS before building.
+
+/// Implicit-GEMM convolution kernel whose epilogue additionally computes the absolute maximum
+/// of the output and writes a pre-activation auxiliary tensor. Grid-level entry point: each
+/// threadblock computes one output tile; split-K reduction is supported (serial via semaphore,
+/// or parallel via per-slice output offsets).
+template <
+  typename Mma_,                                  ///! Threadblock-scoped matrix multiply-accumulate
+  typename Epilogue_,                             ///! Epilogue
+  typename ThreadblockSwizzle_,                   ///! Threadblock swizzling function
+  conv::Operator ConvOperator,                    ///! Convolutional operator (Fprop, Dgrad, Wgrad)
+  typename ConvProblemSize_ = Conv2dProblemSize   ///! Convolutional operator on 2D or 3D problem
+>
+struct ImplicitGemmConvolutionWithAbsMax {
+
+  using Mma = Mma_;
+  using Epilogue = Epilogue_;
+  using EpilogueOutputOp = typename Epilogue::OutputOp;
+  using ThreadblockSwizzle = ThreadblockSwizzle_;
+  static Operator const kConvolutionalOperator = ConvOperator;
+
+  using ElementA = typename Mma::IteratorA::Element;
+  using LayoutA = typename Mma::IteratorA::Layout;
+  using ElementB = typename Mma::IteratorB::Element;
+  using LayoutB = typename Mma::IteratorB::Layout;
+  using ElementC = typename EpilogueOutputOp::ElementOutput;
+
+  /// Set output tensor C layout
+  using LayoutC = LayoutA;
+
+  using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator;
+  using ElementCompute = typename EpilogueOutputOp::ElementCompute;
+
+  using WarpMmaOperator = typename Mma::Policy::Operator;
+
+  using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator;
+  using MathOperator = typename ArchMmaOperator::Operator;
+
+  using OperatorClass = typename WarpMmaOperator::OperatorClass;
+  using ArchTag = typename WarpMmaOperator::ArchTag;
+
+  using ThreadblockShape = typename Mma::Shape;
+  using WarpShape = typename WarpMmaOperator::Shape;
+  using InstructionShape = typename ArchMmaOperator::Shape;
+
+  static int const kStages = Mma::kStages;
+  static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm;
+  static StrideSupport const kStrideSupport = Mma::IteratorA::kStrideSupport;
+
+  /// Warp count (concept: GemmShape)
+  using WarpCount = typename Mma::WarpCount;
+  // One CTA = 32 threads per warp * total warps in the threadblock tile.
+  static int const kThreadCount = 32 * WarpCount::kCount;
+
+  using TensorRefA = typename Mma::IteratorA::TensorRef;
+  using TensorRefB = typename Mma::IteratorB::TensorRef;
+  // NOTE(review): template arguments stripped here — upstream these are
+  // TensorRef<ElementC, LayoutC> and TensorRef<aux element, LayoutC>; confirm against upstream.
+  using TensorRefC = cutlass::TensorRef;
+  using TensorRefAux = cutlass::TensorRef;
+
+  /// Check iterator A and B convolution dimension are the same and
+  // set device::ImplicitGemmConvolution::kConvDim
+  // NOTE(review): message below contains a duplicated word ("different different") — typo to fix upstream.
+  static_assert(Mma::IteratorA::kConvDim == Mma::IteratorB::kConvDim,
+    "Convolution on different different dimensions is not supported");
+  static int const kConvDim = Mma::IteratorA::kConvDim;
+
+  /// Conv dimension and problem size structure (Conv2d or Conv3d)
+  using ConvProblemSize = ConvProblemSize_;
+
+  // Grouped convolution is not supported by this kernel.
+  static conv::GroupMode const kGroupMode = conv::GroupMode::kNone;
+
+  /// Wgrad C stride idx for implicit gemm algorithm
+  // Conv2d row-major matrix C (KxRSC)
+  // Conv3d row-major matrix C (KxTRSC)
+  // NOTE(review): is_same's template arguments were stripped — presumably
+  // is_same<LayoutC, layout::TensorNHWC> selecting stride index 2 for Conv2d, 3 for Conv3d; verify.
+  static int const kWgradCStrideIdx =
+    platform::is_same::value ? 2 : 3;
+
+  /// This chooses the appropriate stride element of the C tensor.
+  static int const kTensorCStrideIdx =
+    (kConvolutionalOperator == conv::Operator::kWgrad ? kWgradCStrideIdx : 0);
+
+  //
+  // Maps conv output tensor refs to the epilogue's GEMM-style output iterator parameters.
+  //
+  using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter<
+    LayoutC,
+    typename Epilogue::OutputTileIterator::Layout,
+    TensorRefC,
+    ConvOperator,
+    ConvProblemSize
+  >;
+
+  /// Argument structure
+  /// Host-side, user-facing arguments: problem size, tensor refs for A/B/C/D and the
+  /// auxiliary output, epilogue params, split-K mode, and an optional reduction vector.
+  struct Arguments {
+
+    //
+    // Data members
+    //
+
+    ConvProblemSize problem_size;
+    TensorRefA ref_A;
+    TensorRefB ref_B;
+    TensorRefC ref_C;
+    TensorRefC ref_D;
+    // NOTE(review): declared TensorRefC here but the ctor below takes TensorRefAux — with the
+    // stripped template arguments these may differ upstream; verify against upstream CUTLASS.
+    TensorRefC ref_Aux;
+
+    typename EpilogueOutputOp::Params output_op;
+    SplitKMode split_k_mode;
+
+    // Optional pointer to a per-column reduction vector (nullptr to disable).
+    void * ptr_Vector;
+
+    // Leading dimension (stride) of the reduction vector between row tiles.
+    typename LayoutC::Stride::Index ldr;
+
+    //
+    // Methods
+    //
+
+    /// Default ctor
+    CUTLASS_HOST_DEVICE
+    Arguments() { }
+
+    /// Construct from problem size only (tensor refs default-initialized).
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      ConvProblemSize const & problem_size
+    ):
+      problem_size(problem_size) { }
+
+    /// Construct fully-specified arguments.
+    CUTLASS_HOST_DEVICE
+    Arguments(
+      ConvProblemSize const & problem_size,
+      TensorRefA const & ref_A,
+      TensorRefB const & ref_B,
+      TensorRefC const & ref_C,
+      TensorRefC const & ref_D,
+      TensorRefAux const & ref_Aux,
+      typename EpilogueOutputOp::Params const & output_op,
+      SplitKMode const & split_k_mode = SplitKMode::kSerial,
+      void * ptr_Vector = nullptr,
+      typename LayoutC::Stride::Index ldr = 0
+    ):
+      problem_size(problem_size),
+      ref_A(ref_A),
+      ref_B(ref_B),
+      ref_C(ref_C),
+      ref_D(ref_D),
+      ref_Aux(ref_Aux),
+      output_op(output_op),
+      split_k_mode(split_k_mode),
+      ptr_Vector(ptr_Vector),
+      ldr(ldr)
+    {
+
+    }
+
+  };
+
+  /// Parameters structure
+  /// Device-side, precomputed kernel parameters derived from Arguments at launch time.
+  struct Params {
+    ConvProblemSize problem_size;
+    cutlass::gemm::GemmCoord grid_tiled_shape;
+    gemm::GemmCoord implicit_gemm_problem_size;
+    int swizzle_log_tile;
+
+    int gemm_k_iterations;
+    typename Mma::IteratorA::Params iterator_A;
+    typename Mma::IteratorA::Element const *ptr_A;
+    typename Mma::IteratorB::Params iterator_B;
+    typename Mma::IteratorB::Element const *ptr_B;
+    typename Epilogue::OutputTileIterator::Params iterator_C;
+    typename Epilogue::OutputTileIterator::Element *ptr_C;
+    typename Epilogue::OutputTileIterator::Params iterator_D;
+    typename Epilogue::OutputTileIterator::Element *ptr_D;
+    typename Epilogue::AuxOutputTileIterator::Params iterator_Aux;
+    typename Epilogue::AuxOutputTileIterator::Element *ptr_Aux;
+    typename EpilogueOutputOp::Params output_op;
+    // Workspace of per-output-tile locks used for serial split-K reduction.
+    int *semaphore;
+    SplitKMode split_k_mode;
+
+    void * ptr_Vector;
+    typename LayoutC::Stride::Index ldr;
+
+    //
+    // Methods
+    //
+
+    CUTLASS_HOST_DEVICE
+    Params():
+      swizzle_log_tile(0),
+      gemm_k_iterations(0),
+      ptr_Vector(nullptr),
+      ldr(0)
+    { }
+
+    /// Construct device parameters from host Arguments; computes the implicit-GEMM problem
+    /// size, per-tensor iterator params, gemm_k_iterations, and the swizzled grid shape.
+    CUTLASS_HOST_DEVICE
+    Params(
+      Arguments const &args,
+      int *semaphore = nullptr
+    ):
+      problem_size(args.problem_size),
+      implicit_gemm_problem_size(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size)),
+      iterator_A(Mma::IteratorA::getParams(args.problem_size, args.ref_A.layout())),
+      ptr_A(args.ref_A.data()),
+      iterator_B(args.problem_size, args.ref_B.layout()),
+      ptr_B(args.ref_B.data()),
+      iterator_C(ConvOutputIteratorParameter::layout(args.ref_C)),
+      ptr_C(args.ref_C.data()),
+      iterator_D(ConvOutputIteratorParameter::layout(args.ref_D)),
+      ptr_D(args.ref_D.data()),
+      iterator_Aux(ConvOutputIteratorParameter::layout(args.ref_Aux)),
+      ptr_Aux(args.ref_Aux.data()),
+      output_op(args.output_op),
+      semaphore(semaphore),
+      split_k_mode(args.split_k_mode),
+      ptr_Vector(args.ptr_Vector),
+      ldr(args.ldr)
+
+    {
+      gemm_k_iterations = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape::kK, args.problem_size);
+
+      ThreadblockSwizzle threadblock_swizzle;
+
+      grid_tiled_shape = threadblock_swizzle.get_tiled_shape(
+        implicit_gemm_problem_size,
+        {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK},
+        args.problem_size.split_k_slices);
+
+      swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape);
+    }
+  };
+
+  /// Shared memory storage structure
+  // Union: the main-loop and epilogue phases never overlap, so they share one allocation.
+  union SharedStorage {
+    typename Mma::SharedStorage main_loop;
+    typename Epilogue::SharedStorage epilogue;
+  };
+
+  //
+  // Methods
+  //
+
+  CUTLASS_HOST_DEVICE
+  ImplicitGemmConvolutionWithAbsMax() { }
+
+  /// Executes one ImplicitGEMM
+  /// One threadblock computes one output tile: main-loop MMA over the gemm-K extent,
+  /// then the epilogue (with absmax and auxiliary output), coordinating split-K via the
+  /// per-tile semaphore when split_k_mode is kSerial.
+  CUTLASS_DEVICE
+  void operator()(Params const &params, SharedStorage &shared_storage) {
+
+    // Compute threadblock location
+    ThreadblockSwizzle threadblock_swizzle;
+
+    cutlass::gemm::GemmCoord threadblock_tile_idx =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // Early exit if CTA is out of range
+    if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() ||
+      params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) {
+
+      return;
+    }
+
+    // Compute position within threadblock
+    int thread_idx = threadIdx.x;
+
+    // Construct iterators to A and B operands
+    typename Mma::IteratorA iterator_A(
+      params.iterator_A,
+      params.problem_size,
+      params.ptr_A,
+      thread_idx,
+      MatrixCoord(
+        threadblock_tile_idx.m() * Mma::Shape::kM,
+        threadblock_tile_idx.k() * Mma::Shape::kK
+      )
+    );
+
+    typename Mma::IteratorB iterator_B(
+      params.iterator_B,
+      params.problem_size,
+      params.ptr_B,
+      thread_idx,
+      MatrixCoord(
+        threadblock_tile_idx.k() * Mma::Shape::kK,
+        threadblock_tile_idx.n() * Mma::Shape::kN
+      )
+    );
+
+    // Broadcast the warp_id computed by lane 0 to ensure dependent code
+    // is compiled as warp-uniform.
+    int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0);
+    int lane_idx = threadIdx.x % 32;
+
+    //
+    // Main loop
+    //
+
+    // Construct thread-scoped matrix multiply
+    Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
+
+    typename Mma::FragmentC accumulators;
+
+    accumulators.clear();
+
+    // Compute threadblock-scoped matrix multiply-add
+    // (accumulators is both source and destination of the MMA).
+    mma(params.gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators);
+
+    //
+    // Epilogue
+    //
+
+    EpilogueOutputOp output_op(params.output_op);
+
+    // Construct the semaphore.
+    // One lock per output tile, indexed in column-major order over the grid.
+    int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m();
+
+    Semaphore semaphore(params.semaphore + block_idx, thread_idx);
+
+    // Compute logical position within grid
+    threadblock_tile_idx =
+        threadblock_swizzle.get_tile_offset(params.swizzle_log_tile);
+
+    // If performing a reduction via split-K, fetch the initial synchronization
+    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
+
+      // Fetch the synchronization lock initially but do not block.
+      semaphore.fetch();
+
+      // Indicate which position in a serial reduction the output operator is currently updating
+      output_op.set_k_partition(threadblock_tile_idx.k(), params.grid_tiled_shape.k());
+    }
+
+    // Element offset of this threadblock's output tile within the C/D tensors.
+    MatrixCoord threadblock_offset(
+      threadblock_tile_idx.m() * Mma::Shape::kM,
+      threadblock_tile_idx.n() * Mma::Shape::kN
+    );
+
+    // Tile iterator writing to destination tensor
+    typename Epilogue::OutputTileIterator iterator_D(
+      params.iterator_D,
+      params.ptr_D,
+      ConvOutputIteratorParameter::extent(params.problem_size),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Tile iterator writing to auxiliary tensor.
+    typename Epilogue::AuxOutputTileIterator iterator_Aux(
+      params.iterator_Aux,
+      params.ptr_Aux,
+      ConvOutputIteratorParameter::extent(params.problem_size),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Tile iterator reading from source accumulator tensor
+    typename Epilogue::OutputTileIterator iterator_C(
+      params.iterator_C,
+      params.ptr_C,
+      ConvOutputIteratorParameter::extent(params.problem_size),
+      thread_idx,
+      threadblock_offset
+    );
+
+    // Define the reduction output pointer and move to the appropriate place
+    // NOTE(review): static_cast's template argument was stripped — presumably
+    // static_cast<typename Epilogue::ElementVector *>; verify against upstream.
+    typename Epilogue::ElementVector *ptr_Vector =
+        static_cast(params.ptr_Vector);
+
+
+    // Construct the epilogue
+    Epilogue epilogue(
+      shared_storage.epilogue,
+      thread_idx,
+      warp_idx,
+      lane_idx);
+
+    // Move to appropriate location for this output tile
+    if (ptr_Vector) {
+      ptr_Vector += threadblock_offset.column() + threadblock_tile_idx.m() * params.ldr;
+    }
+
+    // Wait on the semaphore - this latency may have been covered by iterator construction
+    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
+
+      // For subsequent threadblocks, the source matrix is held in the 'D' tensor.
+      if (threadblock_tile_idx.k()) {
+        iterator_C = iterator_D;
+      }
+
+      // Block until the previous k-slice has released this tile's lock.
+      semaphore.wait(threadblock_tile_idx.k());
+
+    }
+    // Each split-k-slice writes to a unique tensor location
+    else if (params.split_k_mode == SplitKMode::kParallel) {
+      iterator_D.add_pointer_offset(threadblock_tile_idx.k() *
+        cutlass::conv::implicit_gemm_tensor_c_size(ConvOperator, params.problem_size));
+    }
+
+    // Execute the epilogue operator to update the destination tensor.
+    epilogue(output_op,
+             // Only the final block uses Vector
+             ((params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) &&
+              (params.grid_tiled_shape.k() != threadblock_tile_idx.k() + 1))
+               ? nullptr
+               : ptr_Vector,
+             iterator_D,
+             accumulators,
+             iterator_C,
+             iterator_Aux,
+             ConvOutputIteratorParameter::extent(params.problem_size),
+             threadblock_offset);
+
+    //
+    // Release the semaphore
+    //
+
+    if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) {
+
+      int lock = 0;
+      if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) {
+
+        // The final threadblock resets the semaphore for subsequent grids.
+        lock = 0;
+      }
+      else {
+        // Otherwise, the semaphore is incremented
+        lock = threadblock_tile_idx.k() + 1;
+      }
+
+      semaphore.release(lock);
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace kernel
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e810e3d13c8b8eed4894ac9670f4a586dcaef8d
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h
@@ -0,0 +1,499 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a pipelined Implicit GEMM kernel. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/aligned_buffer.h" +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/semaphore.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/epilogue/threadblock/output_iterator_parameter.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Mma_, ///! Threadblock-scoped matrix multiply-accumulate + typename Epilogue_, ///! Epilogue + typename ThreadblockSwizzle_, ///! Threadblock swizzling function + conv::Operator ConvOperator, ///! Convolutional operator (Fprop, Dgrad, Wgrad, Deconv) + typename ConvProblemSize_ = Conv2dProblemSize ///! 
Convolutional operator on 2D or 3D problem +> +struct ImplicitGemmConvolutionWithFusedEpilogue { + + using Mma = Mma_; + using Epilogue = Epilogue_; + using EpilogueOutputOp = typename Epilogue::OutputOp; + using ThreadblockSwizzle = ThreadblockSwizzle_; + static Operator const kConvolutionalOperator = ConvOperator; + + using ElementA = typename Mma::IteratorA::Element; + using LayoutA = typename Mma::IteratorA::Layout; + using ElementB = typename Mma::IteratorB::Element; + using LayoutB = typename Mma::IteratorB::Layout; + using ElementC = typename EpilogueOutputOp::ElementOutput; + + /// Set output tensor C layout + using LayoutC = LayoutA; + + using ElementAccumulator = typename EpilogueOutputOp::ElementAccumulator; + using ElementCompute = typename EpilogueOutputOp::ElementCompute; + + using WarpMmaOperator = typename Mma::Policy::Operator; + + using ArchMmaOperator = typename WarpMmaOperator::ArchMmaOperator; + using MathOperator = typename ArchMmaOperator::Operator; + + using OperatorClass = typename WarpMmaOperator::OperatorClass; + using ArchTag = typename WarpMmaOperator::ArchTag; + + using ThreadblockShape = typename Mma::Shape; + using WarpShape = typename WarpMmaOperator::Shape; + using InstructionShape = typename ArchMmaOperator::Shape; + + static int const kStages = Mma::kStages; + static IteratorAlgorithm const kIteratorAlgorithm = Mma::IteratorA::kIteratorAlgorithm; + static StrideSupport const kStrideSupport = Mma::IteratorA::kStrideSupport; + + /// Warp count (concept: GemmShape) + using WarpCount = typename Mma::WarpCount; + static int const kThreadCount = 32 * WarpCount::kCount; + + using TensorRefA = typename Mma::IteratorA::TensorRef; + using TensorRefB = typename Mma::IteratorB::TensorRef; + using TensorRefC = cutlass::TensorRef; + + /// Check iterator A and B convolution dimension are the same and + // set device::ImplicitGemmConvolution::kConvDim + static_assert(Mma::IteratorA::kConvDim == Mma::IteratorB::kConvDim, + "Convolution on 
different different dimensions is not supported"); + static int const kConvDim = Mma::IteratorA::kConvDim; + + /// Conv dimension and problem size structure (Conv2d or Conv3d) + using ConvProblemSize = ConvProblemSize_; + + static conv::GroupMode const kGroupMode = conv::GroupMode::kNone; + + /// Wgrad C stride idx for implicit gemm algorithm + // Conv2d row-major matrix C (KxRSC) + // Conv3d row-major matrix C (KxTRSC) + static int const kWgradCStrideIdx = + platform::is_same::value ? 2 : 3; + + /// This chooses the appropriate stride element of the C tensor. + static int const kTensorCStrideIdx = + (kConvolutionalOperator == conv::Operator::kWgrad ? kWgradCStrideIdx : 0); + + // + // + // + using ConvOutputIteratorParameter = epilogue::threadblock::ConvOutputIteratorParameter< + LayoutC, + typename Epilogue::OutputTileIterator::Layout, + TensorRefC, + ConvOperator, + ConvProblemSize + >; + + /// Argument structure + struct Arguments { + + // + // Data members + // + + ConvProblemSize problem_size; + TensorRefA ref_A; + TensorRefB ref_B; + TensorRefC ref_C; + TensorRefC ref_D; + + typename EpilogueOutputOp::Params output_op; + SplitKMode split_k_mode; + + void * ptr_Vector; + void * ptr_Tensor; + + typename LayoutC::Stride::Index ldr; + typename LayoutC::Stride::Index ldt; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + Arguments() { } + + CUTLASS_HOST_DEVICE + Arguments( + ConvProblemSize const & problem_size + ): + problem_size(problem_size) { } + + CUTLASS_HOST_DEVICE + Arguments( + ConvProblemSize const & problem_size, + TensorRefA const & ref_A, + TensorRefB const & ref_B, + TensorRefC const & ref_C, + TensorRefC const & ref_D, + typename EpilogueOutputOp::Params const & output_op, + SplitKMode const & split_k_mode = SplitKMode::kSerial, + void * ptr_Vector = nullptr, + void * ptr_Tensor = nullptr, + typename LayoutC::Stride::Index ldr = 0, + typename LayoutC::Stride::Index ldt = 0 + ): + problem_size(problem_size), + ref_A(ref_A), + 
ref_B(ref_B), + ref_C(ref_C), + ref_D(ref_D), + output_op(output_op), + split_k_mode(split_k_mode), + ptr_Vector(ptr_Vector), + ptr_Tensor(ptr_Tensor), + ldr(ldr), + ldt(ldt) + { + + } + + }; + + /// Parameters structure + struct Params { + ConvProblemSize problem_size; + cutlass::gemm::GemmCoord grid_tiled_shape; + gemm::GemmCoord implicit_gemm_problem_size; + int swizzle_log_tile; + + int gemm_k_iterations; + typename Mma::IteratorA::Params iterator_A; + typename Mma::IteratorA::Element const *ptr_A; + typename Mma::IteratorB::Params iterator_B; + typename Mma::IteratorB::Element const *ptr_B; + typename Epilogue::OutputTileIterator::Params iterator_C; + typename Epilogue::OutputTileIterator::Element *ptr_C; + typename Epilogue::OutputTileIterator::Params iterator_D; + typename Epilogue::OutputTileIterator::Element *ptr_D; + typename EpilogueOutputOp::Params output_op; + int *semaphore; + SplitKMode split_k_mode; + + typename Epilogue::TensorTileIterator::Params params_Tensor; + void * ptr_Vector; + typename LayoutC::Stride::Index ldr; + void * ptr_Tensor; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params(): + swizzle_log_tile(0), + gemm_k_iterations(0), + ptr_Vector(nullptr), + ldr(0), + ptr_Tensor(nullptr) + { } + + /// + CUTLASS_HOST_DEVICE + Params( + Arguments const &args, + int *semaphore = nullptr + ): + problem_size(args.problem_size), + implicit_gemm_problem_size(cutlass::conv::implicit_gemm_problem_size(kConvolutionalOperator, args.problem_size)), + iterator_A(Mma::IteratorA::getParams(args.problem_size, args.ref_A.layout())), + ptr_A(args.ref_A.data()), + iterator_B(args.problem_size, args.ref_B.layout()), + ptr_B(args.ref_B.data()), + iterator_C(ConvOutputIteratorParameter::layout(args.ref_C), implicit_gemm_tensor_c_extent(kConvolutionalOperator, args.problem_size)), + ptr_C(args.ref_C.data()), + iterator_D(ConvOutputIteratorParameter::layout(args.ref_D), implicit_gemm_tensor_c_extent(kConvolutionalOperator, args.problem_size)), + 
ptr_D(args.ref_D.data()), + output_op(args.output_op), + semaphore(semaphore), + split_k_mode(args.split_k_mode), + params_Tensor(args.ldt), + ptr_Vector(args.ptr_Vector), + ldr(args.ldr), + ptr_Tensor(args.ptr_Tensor) + + { + gemm_k_iterations = implicit_gemm_k_iterations(kConvolutionalOperator, ThreadblockShape::kK, args.problem_size); + + ThreadblockSwizzle threadblock_swizzle; + + grid_tiled_shape = threadblock_swizzle.get_tiled_shape( + implicit_gemm_problem_size, + {ThreadblockShape::kM, ThreadblockShape::kN, ThreadblockShape::kK}, + args.problem_size.split_k_slices); + + swizzle_log_tile = threadblock_swizzle.get_log_tile(grid_tiled_shape); + } + }; + + /// Shared memory storage structure + union SharedStorage { + typename Mma::SharedStorage main_loop; + typename Epilogue::SharedStorage epilogue; + }; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + ImplicitGemmConvolutionWithFusedEpilogue() { } + + /// Executes one ImplicitGEMM + CUTLASS_DEVICE + void operator()(Params const ¶ms, SharedStorage &shared_storage) { + + // Compute threadblock location + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord threadblock_tile_idx = + threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // Early exit if CTA is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_idx.m() || + params.grid_tiled_shape.n() <= threadblock_tile_idx.n()) { + + return; + } + + // Compute position within threadblock + int thread_idx = threadIdx.x; + + // Construct iterators to A and B operands + typename Mma::IteratorA iterator_A( + params.iterator_A, + params.problem_size, + params.ptr_A, + thread_idx, + MatrixCoord( + threadblock_tile_idx.m() * Mma::Shape::kM, + threadblock_tile_idx.k() * Mma::Shape::kK + ) + ); + + typename Mma::IteratorB iterator_B( + params.iterator_B, + params.problem_size, + params.ptr_B, + thread_idx, + MatrixCoord( + threadblock_tile_idx.k() * Mma::Shape::kK, + threadblock_tile_idx.n() * Mma::Shape::kN + ) + ); + + // 
Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. + int warp_idx = canonical_warp_idx_sync(); + int lane_idx = threadIdx.x % 32; + + // + // Main loop + // + + // Construct thread-scoped matrix multiply + Mma mma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); + + typename Mma::FragmentC accumulators; + + accumulators.clear(); + + // Compute threadblock-scoped matrix multiply-add + mma(params.gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators); + + // + // Epilogue + // + + EpilogueOutputOp output_op(params.output_op); + + // Construct the semaphore. + int block_idx = threadblock_tile_idx.m() + threadblock_tile_idx.n() * params.grid_tiled_shape.m(); + + Semaphore semaphore(params.semaphore + block_idx, thread_idx); + + // Compute logical position within grid + threadblock_tile_idx = + threadblock_swizzle.get_tile_offset(params.swizzle_log_tile); + + // If performing a reduction via split-K, fetch the initial synchronization + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + // Fetch the synchronization lock initially but do not block. 
+ semaphore.fetch(); + + // Indicate which position in a serial reduction the output operator is currently updating + output_op.set_k_partition(threadblock_tile_idx.k(), params.grid_tiled_shape.k()); + } + + MatrixCoord threadblock_offset( + threadblock_tile_idx.m() * Mma::Shape::kM, + threadblock_tile_idx.n() * Mma::Shape::kN + ); + + // Tile iterator writing to destination tensor + typename Epilogue::OutputTileIterator iterator_D( + params.iterator_D, + params.ptr_D, + ConvOutputIteratorParameter::extent(params.problem_size), + thread_idx, + threadblock_offset + ); + + // Tile iterator reading from source accumulator tensor + typename Epilogue::OutputTileIterator iterator_C( + params.iterator_C, + params.ptr_C, + ConvOutputIteratorParameter::extent(params.problem_size), + thread_idx, + threadblock_offset + ); + + typename Epilogue::ElementTensor *ptr_Tensor = + static_cast(params.ptr_Tensor); + + // Define the reduction output pointer and move to the appropriate place + typename Epilogue::ElementVector *ptr_Vector = + static_cast(params.ptr_Vector); + + // Additional tensor to load from + typename Epilogue::TensorTileIterator tensor_iterator( + params.params_Tensor, + // Only the final block outputs Tensor + ((params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) && + (params.grid_tiled_shape.k() != threadblock_tile_idx.k() + 1)) + ? 
nullptr + : ptr_Tensor, + ConvOutputIteratorParameter::extent(params.problem_size), + thread_idx, + threadblock_offset); + + // Construct the epilogue + Epilogue epilogue( + shared_storage.epilogue, + thread_idx, + warp_idx, + lane_idx); + + // Move to appropriate location for this output tile + if (ptr_Vector) { + ptr_Vector += threadblock_offset.column() + threadblock_tile_idx.m() * params.ldr; + } + + // Wait on the semaphore - this latency may have been covered by iterator construction + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + // For subsequent threadblocks, the source matrix is held in the 'D' tensor. + if (threadblock_tile_idx.k()) { + iterator_C = iterator_D; + } + + semaphore.wait(threadblock_tile_idx.k()); + + } + // Each split-k-slice writes to a unique tensor location + else if (params.split_k_mode == SplitKMode::kParallel) { + iterator_D.add_pointer_offset(threadblock_tile_idx.k() * + cutlass::conv::implicit_gemm_tensor_c_size(ConvOperator, params.problem_size)); + } + + // Execute the epilogue operator to update the destination tensor. + epilogue(output_op, + // Only the final block uses Vector + ((params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) && + (params.grid_tiled_shape.k() != threadblock_tile_idx.k() + 1)) + ? nullptr + : ptr_Vector, + iterator_D, + accumulators, + iterator_C, + tensor_iterator, + ConvOutputIteratorParameter::extent(params.problem_size), + threadblock_offset); + + // + // Release the semaphore + // + + if (params.split_k_mode == SplitKMode::kSerial && params.grid_tiled_shape.k() > 1) { + + int lock = 0; + if (params.grid_tiled_shape.k() == threadblock_tile_idx.k() + 1) { + + // The final threadblock resets the semaphore for subsequent grids. 
+ lock = 0; + } + else { + // Otherwise, the semaphore is incremented + lock = threadblock_tile_idx.k() + 1; + } + + semaphore.release(lock); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/sm100_implicit_gemm_tma_warpspecialized.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/sm100_implicit_gemm_tma_warpspecialized.hpp new file mode 100644 index 0000000000000000000000000000000000000000..327fc27db4eba8093ce58845e465071da724c2e8 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/sm100_implicit_gemm_tma_warpspecialized.hpp @@ -0,0 +1,874 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/kernel_hardware_info.hpp" + +#include "cute/tensor.hpp" +#include "cute/arch/tmem_allocator_sm100.hpp" +#include "cute/arch/cluster_sm90.hpp" + +#include "cutlass/arch/arch.h" +#include "cutlass/arch/grid_dependency_control.h" +#include "cutlass/conv/detail.hpp" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/dispatch_policy.hpp" +#include "cutlass/gemm/kernel/tile_scheduler.hpp" +#include "cutlass/pipeline/sm100_pipeline.hpp" +#include "cutlass/detail/sm100_tmem_helper.hpp" + +/////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv::kernel { + +/////////////////////////////////////////////////////////////////////////////// + +template < + class ProblemShape_, + class CollectiveMainloop_, + class CollectiveEpilogue_, + class TileSchedulerTag_ +> +class ConvUniversal< + ProblemShape_, + CollectiveMainloop_, + CollectiveEpilogue_, + TileSchedulerTag_, + cute::enable_if_t>> +{ +public: + // + // Type Aliases + // + + 
// Mainloop derived types + using ProblemShape = ProblemShape_; + using CollectiveMainloop = CollectiveMainloop_; + + using TileShape = typename CollectiveMainloop::TileShape; + using TiledMma = typename CollectiveMainloop::TiledMma; + using ArchTag = typename CollectiveMainloop::ArchTag; + using ElementA = typename CollectiveMainloop::ElementA; + using StrideA = typename CollectiveMainloop::StrideA; + using ElementB = typename CollectiveMainloop::ElementB; + using StrideB = typename CollectiveMainloop::StrideB; + using DispatchPolicy = typename CollectiveMainloop::DispatchPolicy; + using ElementAccumulator = typename CollectiveMainloop::ElementAccumulator; + using ClusterShape = typename DispatchPolicy::ClusterShape; + using MainloopArguments = typename CollectiveMainloop::Arguments; + using MainloopParams = typename CollectiveMainloop::Params; + using CtaShape_MNK = typename CollectiveMainloop::CtaShape_MNK; + using AtomThrShapeMNK = typename CollectiveMainloop::AtomThrShapeMNK; + static constexpr int NumSpatialDimensions = CollectiveMainloop::NumSpatialDimensions; + static constexpr bool is_grouped_wgrad = CollectiveMainloop::is_grouped_wgrad; + static constexpr bool IsComplex = false; + static_assert(ArchTag::kMinComputeCapability >= 100); + + // Epilogue derived types + using CollectiveEpilogue = CollectiveEpilogue_; + using ElementC = typename CollectiveEpilogue::ElementC; + using StrideC = typename CollectiveEpilogue::StrideC; + using ElementD = typename CollectiveEpilogue::ElementD; + using StrideD = typename CollectiveEpilogue::StrideD; + using EpilogueArguments = typename CollectiveEpilogue::Arguments; + using EpilogueParams = typename CollectiveEpilogue::Params; + + static constexpr bool IsGdcEnabled = cutlass::arch::IsGdcGloballyEnabled; + // TileID scheduler + // CLC pipeline depth determines how many waves (stages-1) the scheduler can race ahead + static constexpr uint32_t SchedulerPipelineStageCount = 
DispatchPolicy::Schedule::SchedulerPipelineStageCount; + static constexpr uint32_t AccumulatorPipelineStageCount = DispatchPolicy::Schedule::AccumulatorPipelineStageCount; + + using TileSchedulerTag = TileSchedulerTag_; + using TileScheduler = typename cutlass::gemm::kernel::detail::TileSchedulerSelector< + TileSchedulerTag, ArchTag, CtaShape_MNK, ClusterShape, SchedulerPipelineStageCount>::Scheduler; + using TileSchedulerArguments = typename TileScheduler::Arguments; + using TileSchedulerParams = typename TileScheduler::Params; + + static constexpr bool IsDynamicCluster = not cute::is_static_v; + + // Warp specialization thread count per threadblock + static constexpr uint32_t NumSchedThreads = NumThreadsPerWarp; // 1 warp + static constexpr uint32_t NumMMAThreads = NumThreadsPerWarp; // 1 warp + static constexpr uint32_t NumMainloopLoadThreads = NumThreadsPerWarp; // 1 warp + static constexpr uint32_t NumEpilogueLoadThreads = NumThreadsPerWarp; // 1 warp + static constexpr uint32_t NumEpilogueThreads = CollectiveEpilogue::ThreadCount; + static constexpr uint32_t NumEpilogueWarps = NumEpilogueThreads / NumThreadsPerWarp; + + static constexpr uint32_t MaxThreadsPerBlock = NumSchedThreads + + NumMainloopLoadThreads + NumMMAThreads + + NumEpilogueLoadThreads + NumEpilogueThreads; + static constexpr uint32_t MinBlocksPerMultiprocessor = 1; + static constexpr uint32_t NumFixupBarriers = 1; + + // Pipelines and pipeline states + static constexpr uint32_t CLCResponseSize = sizeof(typename TileScheduler::CLCResponse); + + // Pipeline and pipeline state types + using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline; + using MainloopPipelineState = typename CollectiveMainloop::MainloopPipelineState; + + using EpiLoadPipeline = typename CollectiveEpilogue::LoadPipeline; + using EpiLoadPipelineState = typename CollectiveEpilogue::LoadPipelineState; + + using EpiStorePipeline = typename CollectiveEpilogue::StorePipeline; + using EpiStorePipelineState = typename 
CollectiveEpilogue::StorePipelineState; + + using LoadOrderBarrier = cutlass::OrderedSequenceBarrier<1,2>; + + using AccumulatorPipeline = cutlass::PipelineUmmaAsync; + using AccumulatorPipelineState = typename AccumulatorPipeline::PipelineState; + + using CLCPipeline = cutlass::PipelineCLCFetchAsync; + using CLCPipelineState = cutlass::PipelineDetail::PipelineCLCFetchAsyncPipelineState; + using CLCPipelineSharedStorage = cutlass::PipelineDetail::PipelineCLCFetchAsyncSharedStorage; + + using TmemAllocator = cute::conditional_t(typename TiledMma::ThrLayoutVMNK{})) == 1, + cute::TMEM::Allocator1Sm, cute::TMEM::Allocator2Sm>; + + // Kernel level shared memory storage + struct SharedStorage { + struct PipelineStorage : cute::aligned_struct<16, _1> { + using MainloopPipelineStorage = typename CollectiveMainloop::PipelineStorage; + using EpiLoadPipelineStorage = typename CollectiveEpilogue::PipelineStorage; + using LoadOrderBarrierStorage = typename LoadOrderBarrier::SharedStorage; + using CLCPipelineStorage = CLCPipelineSharedStorage; + using AccumulatorPipelineStorage = typename AccumulatorPipeline::SharedStorage; + + alignas(16) MainloopPipelineStorage mainloop; + alignas(16) EpiLoadPipelineStorage epi_load; + alignas(16) LoadOrderBarrierStorage load_order; + alignas(16) CLCPipelineStorage clc; + alignas(16) AccumulatorPipelineStorage accumulator; + alignas(16) arch::ClusterBarrier tmem_dealloc; + } pipelines; + + alignas(16) typename TileScheduler::CLCResponse clc_response[SchedulerPipelineStageCount]; + uint32_t tmem_base_ptr; + + struct TensorStorage : cute::aligned_struct<128, _1> { + using EpilogueTensorStorage = typename CollectiveEpilogue::TensorStorage; + using MainloopTensorStorage = typename CollectiveMainloop::TensorStorage; + + EpilogueTensorStorage epilogue; + MainloopTensorStorage mainloop; + } tensors; + }; + + static constexpr int SharedStorageSize = sizeof(SharedStorage); + static_assert(SharedStorageSize <= cutlass::arch::sm100_smem_capacity_bytes, 
"SMEM usage exceeded capacity."); + + // Host facing host arguments + struct Arguments { + ProblemShape problem_shape{}; + MainloopArguments mainloop{}; + EpilogueArguments epilogue{}; + KernelHardwareInfo hw_info{}; + TileSchedulerArguments scheduler{}; + }; + + // Kernel device entry point API + struct Params { + using ProblemShapeMNKL = decltype(CollectiveMainloop::get_problem_shape_MNKL(ProblemShape{})); + ProblemShapeMNKL problem_shape; + MainloopParams mainloop; + EpilogueParams epilogue; + TileSchedulerParams scheduler; + KernelHardwareInfo hw_info{}; + }; + + enum class WarpCategory : int32_t { + MMA = 0, + Sched = 1, + MainloopLoad = 2, + EpilogueLoad = 3, + Epilogue = 4 + }; + + struct IsParticipant { + uint32_t mma = false; + uint32_t sched = false; + uint32_t main_load = false; + uint32_t epi_load = false; + uint32_t epilogue = false; + }; + + // + // Methods + // + // Map user facing arguments to device facing params + CUTLASS_HOST + static Params + to_underlying_arguments(Arguments const& args, void* workspace) { + static constexpr uint32_t NumEpilogueSubTiles = 1; + + auto problem_shape_mnkl = CollectiveMainloop::get_problem_shape_MNKL(args.problem_shape); + + auto mainloop_params = CollectiveMainloop::to_underlying_arguments(args.problem_shape, args.mainloop, workspace, args.hw_info); + + // Calculate workspace pointers + uint8_t* workspace_ptr = reinterpret_cast(workspace); + size_t workspace_offset = 0; + + // Epilogue + void* epilogue_workspace = workspace_ptr + workspace_offset; + workspace_offset += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue); + workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + + // Tile scheduler + void* scheduler_workspace = workspace_ptr + workspace_offset; + workspace_offset += TileScheduler::template get_workspace_size( + args.scheduler, problem_shape_mnkl, args.hw_info, NumFixupBarriers, NumEpilogueSubTiles, CollectiveEpilogue::NumAccumulatorMtxs); + 
workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + + return { + problem_shape_mnkl, + mainloop_params, + CollectiveEpilogue::to_underlying_arguments(args.problem_shape, args.epilogue, epilogue_workspace), + TileScheduler::to_underlying_arguments( + args.problem_shape, TileShape{}, AtomThrShapeMNK{}, ClusterShape{}, + args.hw_info, args.scheduler, scheduler_workspace), + args.hw_info + }; + } + + CUTLASS_HOST + static bool + can_implement(Arguments const& args) { + bool implementable = true; + implementable &= CollectiveMainloop::can_implement(args.problem_shape, args.mainloop); + implementable &= CollectiveEpilogue::can_implement(args.problem_shape, args.epilogue); + implementable &= TileScheduler::can_implement(args.scheduler); + + if constexpr (IsDynamicCluster) { + static constexpr int MaxClusterSize = 16; + implementable &= size(args.hw_info.cluster_shape) <= MaxClusterSize; + implementable &= size(args.hw_info.cluster_shape_fallback) <= MaxClusterSize; + implementable &= cutlass::detail::preferred_cluster_can_implement(args.hw_info.cluster_shape, args.hw_info.cluster_shape_fallback); + } + + auto cluster_shape = cutlass::detail::select_cluster_shape(ClusterShape{}, args.hw_info.cluster_shape); + auto cluster_shape_fallback = cutlass::detail::select_cluster_shape(ClusterShape{}, args.hw_info.cluster_shape_fallback); + + // implicit gemm B tile can be small for conv, ensure multicast smem offsets are 128B aligned + int multicast_b_bits = (size<1>(TileShape{}) * size<2>(TileShape{}) / size<0>(cluster_shape)) * sizeof_bits_v; + int multicast_b_fallback_bits = (size<1>(TileShape{}) * size<2>(TileShape{}) / size<0>(cluster_shape_fallback)) * sizeof_bits_v; + implementable &= multicast_b_bits % (128*8) == 0 && multicast_b_fallback_bits % (128*8) == 0; + if (not implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: multicast size too large for B tile\n"); + return false; + } + + if constexpr (is_grouped_wgrad) { + implementable &= 
size<0>(cluster_shape) == 1 && size<0>(cluster_shape_fallback) == 1; + + if (!implementable) { + return false; + } + } + + return implementable; + } + + CUTLASS_HOST + static size_t + get_workspace_size(Arguments const& args) { + static constexpr uint32_t NumEpilogueSubTiles = 1; + size_t workspace_size = 0; + auto linear_problem_shape_MNKL = cutlass::conv::detail::get_linearized_problem_shape_MNKL(args.problem_shape); + + // Epilogue + workspace_size += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue); + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + + // Tile scheduler + workspace_size += TileScheduler::template get_workspace_size( + args.scheduler, linear_problem_shape_MNKL, args.hw_info, NumFixupBarriers, NumEpilogueSubTiles, CollectiveEpilogue::NumAccumulatorMtxs); + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + + return workspace_size; + } + + CUTLASS_HOST + static cutlass::Status + initialize_workspace(Arguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr, + CudaHostAdapter* cuda_adapter = nullptr) { + static constexpr uint32_t NumEpilogueSubTiles = 1; + auto linear_problem_shape_MNKL = cutlass::conv::detail::get_linearized_problem_shape_MNKL(args.problem_shape); + Status status = Status::kSuccess; + uint8_t* workspace_ptr = reinterpret_cast(workspace); + size_t workspace_offset = 0; + + // Epilogue + status = CollectiveEpilogue::initialize_workspace( + args.problem_shape, args.epilogue, workspace_ptr + workspace_offset, stream, cuda_adapter); + + workspace_offset += CollectiveEpilogue::get_workspace_size(args.problem_shape, args.epilogue); + workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + if (status != Status::kSuccess) { + return status; + } + + // Tile scheduler + status = TileScheduler::template initialize_workspace + ( + args.scheduler, workspace_ptr + workspace_offset, stream, linear_problem_shape_MNKL, + args.hw_info, 
NumFixupBarriers, NumEpilogueSubTiles, CollectiveEpilogue::NumAccumulatorMtxs, cuda_adapter); + + workspace_offset += TileScheduler::template get_workspace_size + ( + args.scheduler, linear_problem_shape_MNKL, args.hw_info, NumFixupBarriers, NumEpilogueSubTiles, + CollectiveEpilogue::NumAccumulatorMtxs); + workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + if (status != Status::kSuccess) { + return status; + } + + return status; + } + + // Computes the kernel launch grid shape based on runtime parameters + CUTLASS_HOST + static dim3 + get_grid_shape(Params const& params) { + auto cluster_shape = cutlass::detail::select_cluster_shape(ClusterShape{}, params.hw_info.cluster_shape); + + return TileScheduler::get_grid_shape( + params.scheduler, + params.problem_shape, + TileShape{}, + AtomThrShapeMNK{}, + cluster_shape + ,params.hw_info + ); + } + + CUTLASS_HOST + static dim3 + get_block_shape() { + return dim3(MaxThreadsPerBlock, 1, 1); + } + + CUTLASS_DEVICE + void + operator()(Params const& params, char* smem_buf) { + + using namespace cute; + using X = Underscore; + + // Separate out problem shape for convenience + auto problem_shape_MNKL = append<4>(params.problem_shape, _1{}); + auto [M, N, K, L] = problem_shape_MNKL; + + // Account for more than one epilogue warp + int warp_idx = canonical_warp_idx_sync(); + WarpCategory warp_category = warp_idx < static_cast(WarpCategory::Epilogue) ? 
WarpCategory(warp_idx) + : WarpCategory::Epilogue; + + uint32_t lane_predicate = cute::elect_one_sync(); + auto cluster_shape = cutlass::detail::select_cluster_shape(ClusterShape{}); + int cluster_size = size(cluster_shape); + uint32_t cta_rank_in_cluster = cute::block_rank_in_cluster(); + bool is_first_cta_in_cluster = cta_rank_in_cluster == 0; + int cta_coord_v = cta_rank_in_cluster % size<0>(typename TiledMma::AtomThrID{}); + bool is_mma_leader_cta = cta_coord_v == 0; + constexpr bool has_mma_peer_cta = size(AtomThrShapeMNK{}) == 2; + [[maybe_unused]] uint32_t mma_peer_cta_rank = has_mma_peer_cta ? cta_rank_in_cluster ^ 1 : cta_rank_in_cluster; + + // Kernel level shared memory storage + SharedStorage& shared_storage = *reinterpret_cast(smem_buf); + + // In a warp specialized kernel, collectives expose data movement and compute operations separately + CollectiveMainloop collective_mainloop(params.mainloop, cluster_shape, cta_rank_in_cluster); + CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue); + + // Issue Tma Descriptor Prefetch from a single thread + if ((warp_category == WarpCategory::Sched) && lane_predicate) { + collective_mainloop.prefetch_tma_descriptors(); + } + if ((warp_category == WarpCategory::EpilogueLoad) && lane_predicate) { + collective_epilogue.prefetch_tma_descriptors(params.epilogue); + } + + // Do we load source tensor C or other aux inputs + bool is_epi_load_needed = collective_epilogue.is_producer_load_needed(); + IsParticipant is_participant = { + (warp_category == WarpCategory::MMA), // mma + (warp_category == WarpCategory::Sched) && is_first_cta_in_cluster, // sched + (warp_category == WarpCategory::MainloopLoad), // main_load + (warp_category == WarpCategory::EpilogueLoad) && is_epi_load_needed, // epi_load + (warp_category == WarpCategory::Epilogue) // epilogue + }; + + // Mainloop Load pipeline + typename MainloopPipeline::Params mainloop_pipeline_params; + if (WarpCategory::MainloopLoad == 
warp_category) { + mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Producer; + } + if (WarpCategory::MMA == warp_category) { + mainloop_pipeline_params.role = MainloopPipeline::ThreadCategory::Consumer; + } + mainloop_pipeline_params.is_leader = lane_predicate && is_mma_leader_cta && is_participant.main_load; + mainloop_pipeline_params.transaction_bytes = CollectiveMainloop::TmaTransactionBytes; + mainloop_pipeline_params.initializing_warp = 0; + MainloopPipeline mainloop_pipeline(shared_storage.pipelines.mainloop, + mainloop_pipeline_params, + cluster_shape, + cute::true_type{}, // Perform barrier init + cute::false_type{}); // Delay mask calculation + + // Epilogue Load pipeline + typename EpiLoadPipeline::Params epi_load_pipeline_params; + if (WarpCategory::EpilogueLoad == warp_category) { + epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Producer; + } + if (WarpCategory::Epilogue == warp_category) { + epi_load_pipeline_params.role = EpiLoadPipeline::ThreadCategory::Consumer; + } + epi_load_pipeline_params.dst_blockid = cta_rank_in_cluster; + epi_load_pipeline_params.producer_arv_count = NumEpilogueLoadThreads; + epi_load_pipeline_params.consumer_arv_count = NumEpilogueThreads; + epi_load_pipeline_params.transaction_bytes = CollectiveEpilogue::TmaTransactionBytes; + epi_load_pipeline_params.initializing_warp = 1; + EpiLoadPipeline epi_load_pipeline(shared_storage.pipelines.epi_load, epi_load_pipeline_params); + + // Epilogue Store pipeline + typename EpiStorePipeline::Params epi_store_pipeline_params; + epi_store_pipeline_params.always_wait = true; + EpiStorePipeline epi_store_pipeline(epi_store_pipeline_params); + + // Load order barrier + typename LoadOrderBarrier::Params load_order_barrier_params; + load_order_barrier_params.group_id = (warp_category == WarpCategory::MainloopLoad) ? 
0 : 1; + load_order_barrier_params.group_size = NumMainloopLoadThreads; + load_order_barrier_params.initializing_warp = 3; + LoadOrderBarrier load_order_barrier(shared_storage.pipelines.load_order, load_order_barrier_params); + + // CLC pipeline + typename CLCPipeline::Params clc_pipeline_params; + if (WarpCategory::Sched == warp_category) { + clc_pipeline_params.role = CLCPipeline::ThreadCategory::ProducerConsumer; + } + else { + clc_pipeline_params.role = CLCPipeline::ThreadCategory::Consumer; + } + clc_pipeline_params.producer_blockid = 0; + clc_pipeline_params.producer_arv_count = 1; + clc_pipeline_params.consumer_arv_count = NumSchedThreads + cluster_size * + (NumMainloopLoadThreads + NumEpilogueThreads + NumMMAThreads); + if (is_epi_load_needed) { + clc_pipeline_params.consumer_arv_count += cluster_size * NumEpilogueLoadThreads; + } + clc_pipeline_params.transaction_bytes = CLCResponseSize; + clc_pipeline_params.initializing_warp = 4; + CLCPipeline clc_pipeline(shared_storage.pipelines.clc, clc_pipeline_params, cluster_shape); + + // Mainloop-Epilogue pipeline + typename AccumulatorPipeline::Params accumulator_pipeline_params; + if (WarpCategory::MMA == warp_category) { + accumulator_pipeline_params.role = AccumulatorPipeline::ThreadCategory::Producer; + } + if (WarpCategory::Epilogue == warp_category) { + accumulator_pipeline_params.role = AccumulatorPipeline::ThreadCategory::Consumer; + } + // Only one producer thread arrives on this barrier. 
+ accumulator_pipeline_params.producer_arv_count = 1; + accumulator_pipeline_params.consumer_arv_count = size(AtomThrShapeMNK{}) * NumEpilogueThreads; + accumulator_pipeline_params.initializing_warp = 5; + AccumulatorPipeline accumulator_pipeline(shared_storage.pipelines.accumulator, + accumulator_pipeline_params, + cluster_shape, + cute::true_type{}, // Perform barrier init + cute::false_type{}); // Delay mask calculation + + // Tmem allocator + TmemAllocator tmem_allocator{}; + + // Sync allocation status between MMA and epilogue warps within CTA + arch::NamedBarrier tmem_allocation_result_barrier(NumMMAThreads + NumEpilogueThreads, cutlass::arch::ReservedNamedBarriers::TmemAllocBarrier); + // Sync deallocation status between MMA warps of peer CTAs + arch::ClusterBarrier& tmem_deallocation_result_barrier = shared_storage.pipelines.tmem_dealloc; + [[maybe_unused]] uint32_t dealloc_barrier_phase = 0; + if (WarpCategory::MMA == warp_category && has_mma_peer_cta && lane_predicate) { + tmem_deallocation_result_barrier.init(NumMMAThreads); + } + + // We need this to guarantee that the Pipeline init is visible + // To all producers and consumer threadblocks in the cluster + pipeline_init_arrive_relaxed(cluster_size); + + auto load_inputs = collective_mainloop.load_init( + problem_shape_MNKL, params.mainloop, shared_storage.tensors.mainloop); + + uint32_t tmem_stage_ptrs[AccumulatorPipelineStageCount]; + MainloopPipelineState mainloop_pipe_consumer_state; + MainloopPipelineState mainloop_pipe_producer_state = cutlass::make_producer_start_state(); + + EpiLoadPipelineState epi_load_pipe_consumer_state; + EpiLoadPipelineState epi_load_pipe_producer_state = cutlass::make_producer_start_state(); + + // epilogue store pipe is producer-only (consumer is TMA unit, waits via scoreboarding) + EpiStorePipelineState epi_store_pipe_producer_state = cutlass::make_producer_start_state(); + + CLCPipelineState clc_pipe_consumer_state; + CLCPipelineState clc_pipe_producer_state = 
cutlass::make_producer_start_state(); + + AccumulatorPipelineState accumulator_pipe_consumer_state; + AccumulatorPipelineState accumulator_pipe_producer_state = cutlass::make_producer_start_state(); + + dim3 block_id_in_cluster = cute::block_id_in_cluster(); + + // Calculate mask after cluster barrier arrival + mainloop_pipeline.init_masks(cluster_shape, block_id_in_cluster); + accumulator_pipeline.init_masks(cluster_shape, block_id_in_cluster); + + // TileID scheduler + TileScheduler scheduler(&shared_storage.clc_response[0], params.scheduler, problem_shape_MNKL, TileShape{}, block_id_in_cluster); + typename TileScheduler::WorkTileInfo work_tile_info = scheduler.initial_work_tile_info(cluster_shape); + auto cta_coord_mnkl = scheduler.work_tile_to_cta_coord(work_tile_info); + auto acc_shape = collective_mainloop.partition_accumulator_shape(); + auto accumulators = TiledMma::make_fragment_C(acc_shape); + + int TmemColumnsPerAccumulatorTile = cutlass::detail::find_tmem_tensor_col_offset(accumulators); + pipeline_init_wait(cluster_size); + + if (is_participant.main_load) { + // Ensure that the prefetched kernel does not touch + // unflushed global memory prior to this instruction + cutlass::arch::wait_on_dependent_grids(); + + bool do_load_order_arrive = is_epi_load_needed; + Tensor gA_mk = get<0>(load_inputs); + + do { + // Get the number of K tiles to compute for this work as well as the starting K tile offset of the work. 
+ auto k_tile_iter = scheduler.get_k_tile_iterator(work_tile_info, problem_shape_MNKL, TileShape{}, shape<3>(gA_mk)); + auto k_tile_count = scheduler.get_work_k_tile_count(work_tile_info, problem_shape_MNKL, TileShape{}); + auto k_tile_prologue = min(MainloopPipeline::Stages, k_tile_count); + + auto [mainloop_producer_state_next, k_tile_iter_next] = collective_mainloop.load( + params.mainloop, + mainloop_pipeline, + mainloop_pipe_producer_state, + load_inputs, + cta_coord_mnkl, + k_tile_iter, k_tile_prologue + ); + mainloop_pipe_producer_state = mainloop_producer_state_next; + + if (do_load_order_arrive) { + load_order_barrier.arrive(); + do_load_order_arrive = false; + } + + auto [mainloop_producer_state_next_, unused_] = collective_mainloop.load( + params.mainloop, + mainloop_pipeline, + mainloop_pipe_producer_state, + load_inputs, + cta_coord_mnkl, + k_tile_iter_next, k_tile_count - k_tile_prologue + ); + mainloop_pipe_producer_state = mainloop_producer_state_next_; + + // Sync warp to prevent non-participating threads entering next wave early + __syncwarp(); + + auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work( + work_tile_info, + clc_pipeline, + clc_pipe_consumer_state + ); + work_tile_info = next_work_tile_info; + cta_coord_mnkl = scheduler.work_tile_to_cta_coord(work_tile_info); + if (increment_pipe) { + ++clc_pipe_consumer_state; + } + } while (work_tile_info.is_valid()); + collective_mainloop.load_tail(mainloop_pipeline, mainloop_pipe_producer_state); + + } + + else if (is_participant.sched) { + // Whether a new CLC query must be performed. + // See comment below where this variable is updated for a description of + // why this variable is needed. 
+ bool requires_clc_query = true; + + do { + if (requires_clc_query) { + // Query next clcID and update producer state + clc_pipe_producer_state = scheduler.advance_to_next_work(clc_pipeline, clc_pipe_producer_state); + } + + // Fetch next work tile + auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work( + work_tile_info, + clc_pipeline, + clc_pipe_consumer_state + ); + + // Only perform a new CLC query if we consumed a new CLC query result in + // `fetch_next_work`. An example of a case in which CLC `fetch_next_work` does + // not consume a new CLC query response is when processing stream-K units. + // The current stream-K scheduler uses single WorkTileInfo to track multiple + // (potentially-partial) tiles to be computed via stream-K. In this case, + // `fetch_next_work` simply performs in-place updates on the existing WorkTileInfo, + // rather than consuming a CLC query response. + requires_clc_query = increment_pipe; + if (increment_pipe) { + ++clc_pipe_consumer_state; + } + + work_tile_info = next_work_tile_info; + } while (work_tile_info.is_valid()); + clc_pipeline.producer_tail(clc_pipe_producer_state); + } + + else if (is_participant.mma) { + // Tmem allocation sequence + tmem_allocator.allocate(TmemAllocator::Sm100TmemCapacityColumns, &shared_storage.tmem_base_ptr); + __syncwarp(); + tmem_allocation_result_barrier.arrive(); + uint32_t tmem_base_ptr = shared_storage.tmem_base_ptr; + + CUTLASS_PRAGMA_UNROLL + for (int acc_stage = 0; acc_stage < AccumulatorPipelineStageCount; acc_stage++) { + tmem_stage_ptrs[acc_stage] = tmem_base_ptr + (TmemColumnsPerAccumulatorTile * acc_stage) & cutlass::detail::TmemColMask; + } + auto mma_inputs = collective_mainloop.mma_init(shared_storage.tensors.mainloop); + + do { + auto k_tile_count = scheduler.get_work_k_tile_count(work_tile_info, problem_shape_MNKL, TileShape{}); + + // Fetch next work tile + auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work( + work_tile_info, + clc_pipeline, 
+ clc_pipe_consumer_state + ); + + if (increment_pipe) { + ++clc_pipe_consumer_state; + } + + // Wait for tmem accumulator buffer to become empty with a flipped phase + if (is_mma_leader_cta) { + accumulator_pipeline.producer_acquire(accumulator_pipe_producer_state); + } + + // Accumulator stage slice + int acc_stage = accumulator_pipe_producer_state.index(); + accumulators.data() = tmem_stage_ptrs[acc_stage]; + + if (is_mma_leader_cta) { + mainloop_pipe_consumer_state = collective_mainloop.mma( + mainloop_pipeline, + mainloop_pipe_consumer_state, + accumulators, + mma_inputs, + k_tile_count + ); + accumulator_pipeline.producer_commit(accumulator_pipe_producer_state); + } + ++accumulator_pipe_producer_state; + work_tile_info = next_work_tile_info; + } while (work_tile_info.is_valid()); + + // Hint on an early release of global memory resources. + // The timing of calling this function only influences performance, + // not functional correctness. + cutlass::arch::launch_dependent_grids(); + + // Release the right to allocate before deallocations so that the next CTA can rasterize + tmem_allocator.release_allocation_lock(); + + // Leader MMA waits for leader + peer epilogues to release accumulator stage + if (is_mma_leader_cta) { + accumulator_pipeline.producer_tail(accumulator_pipe_producer_state); + } + // Signal to peer MMA that entire tmem allocation can be deallocated + if constexpr (has_mma_peer_cta) { + // Leader does wait + arrive, follower does arrive + wait + tmem_deallocation_result_barrier.arrive(mma_peer_cta_rank, not is_mma_leader_cta); + tmem_deallocation_result_barrier.wait(dealloc_barrier_phase); + tmem_deallocation_result_barrier.arrive(mma_peer_cta_rank, is_mma_leader_cta); + } + + // Free entire tmem allocation + tmem_allocator.free(tmem_base_ptr, TmemAllocator::Sm100TmemCapacityColumns); + } + + else if (is_participant.epi_load) { + // Ensure that the prefetched kernel does not touch + // unflushed global memory prior to this instruction + 
cutlass::arch::wait_on_dependent_grids(); + + bool do_load_order_wait = true; + bool do_tail_load = false; + + do { + bool compute_epilogue = TileScheduler::compute_epilogue(work_tile_info, params.scheduler); + + // Get current work tile and fetch next work tile + auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work( + work_tile_info, + clc_pipeline, + clc_pipe_consumer_state + ); + work_tile_info = next_work_tile_info; + + if (increment_pipe) { + ++clc_pipe_consumer_state; + } + + if (compute_epilogue) { + if (do_load_order_wait) { + load_order_barrier.wait(); + do_load_order_wait = false; + } + + epi_load_pipe_producer_state = collective_epilogue.load( + epi_load_pipeline, + epi_load_pipe_producer_state, + problem_shape_MNKL, + CtaShape_MNK{}, + cta_coord_mnkl, + TileShape{}, + TiledMma{}, + shared_storage.tensors.epilogue + ); + + do_tail_load = true; + } + + // Calculate the cta coordinates of the next work tile + cta_coord_mnkl = scheduler.work_tile_to_cta_coord(work_tile_info); + } while (work_tile_info.is_valid()); + + // Only perform a tail load if one of the work units processed performed + // an epilogue load. An example of a case in which a tail load should not be + // performed is in split-K if a cluster is only assigned non-final splits (for which + // the cluster does not compute the epilogue). 
+ if (do_tail_load) { + collective_epilogue.load_tail( + epi_load_pipeline, epi_load_pipe_producer_state, + epi_store_pipeline, epi_store_pipe_producer_state); + } + } + + else if (is_participant.epilogue) { + // Wait for tmem allocate here + tmem_allocation_result_barrier.arrive_and_wait(); + uint32_t tmem_base_ptr = shared_storage.tmem_base_ptr; + CUTLASS_PRAGMA_UNROLL + for (int acc_stage = 0; acc_stage < AccumulatorPipelineStageCount; acc_stage++) { + tmem_stage_ptrs[acc_stage] = tmem_base_ptr + (TmemColumnsPerAccumulatorTile * acc_stage) & cutlass::detail::TmemColMask; + } + + bool do_tail_store = false; + do { + // Fetch next work tile + auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work( + work_tile_info, + clc_pipeline, + clc_pipe_consumer_state + ); + + if (increment_pipe) { + ++clc_pipe_consumer_state; + } + + // Accumulator stage slice after making sure allocation has been performed + int acc_stage = accumulator_pipe_consumer_state.index(); + accumulators.data() = tmem_stage_ptrs[acc_stage]; + + accumulator_pipe_consumer_state = scheduler.template fixup( + TiledMma{}, + work_tile_info, + accumulators, + accumulator_pipeline, + accumulator_pipe_consumer_state, + typename CollectiveEpilogue::CopyOpT2R{} + ); + + // + // Epilogue and write to gD + // + if (scheduler.compute_epilogue(work_tile_info)) { + auto [load_state_next, store_state_next, acc_state_next] = collective_epilogue.store( + epi_load_pipeline, + epi_load_pipe_consumer_state, + epi_store_pipeline, + epi_store_pipe_producer_state, + accumulator_pipeline, + accumulator_pipe_consumer_state, + problem_shape_MNKL, + CtaShape_MNK{}, + cta_coord_mnkl, + TileShape{}, + TiledMma{}, + accumulators, + shared_storage.tensors.epilogue + ); + epi_load_pipe_consumer_state = load_state_next; + epi_store_pipe_producer_state = store_state_next; + accumulator_pipe_consumer_state = acc_state_next; + do_tail_store = true; + } + work_tile_info = next_work_tile_info; + cta_coord_mnkl = 
scheduler.work_tile_to_cta_coord(work_tile_info); + } while (work_tile_info.is_valid()); + + // Only perform a tail store if one of the work units processed performed + // an epilogue. An example of a case in which a tail load should not be + // performed is in split-K if a cluster is only assigned non-final splits (for which + // the cluster does not compute the epilogue). + if (do_tail_store) { + collective_epilogue.store_tail( + epi_load_pipeline, epi_load_pipe_consumer_state, + epi_store_pipeline, epi_store_pipe_producer_state, + CtaShape_MNK{}); + } + } + + else { + } + } +}; + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::kernel diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2c02a4531edd4078da6c92205f36b62b237c20bc --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp @@ -0,0 +1,76 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/kernel_hardware_info.hpp" + +#include "cute/tensor.hpp" +#include "cute/arch/cluster_sm90.hpp" + +#include "cutlass/conv/detail.hpp" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/dispatch_policy.hpp" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/pipeline/sm90_pipeline.hpp" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/gemm/kernel/tile_scheduler.hpp" + +/////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::conv::kernel { + +/////////////////////////////////////////////////////////////////////////////// + +template < + class ProblemShape_, + class CollectiveMainloop_, + class CollectiveEpilogue_, + class TileScheduler_ +> +class ConvUniversal< + ProblemShape_, + CollectiveMainloop_, + CollectiveEpilogue_, + TileScheduler_, + cute::enable_if_t> +> : public cutlass::gemm::kernel::GemmUniversal< + ProblemShape_, + CollectiveMainloop_, + CollectiveEpilogue_, + TileScheduler_ +> +{}; +/////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::conv::kernel + diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/thread/depthwise_mma.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/thread/depthwise_mma.h new file mode 100644 index 0000000000000000000000000000000000000000..41eaba2f64b1c14fd85de632b1bfe8c9a3efbc1e --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/thread/depthwise_mma.h @@ -0,0 +1,325 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Templates exposing architecture support for depthwise convolution +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/arch/mma.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/thread/mma.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace thread { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// MMA operation +template < + /// Size of the matrix product (concept: GemmShape) + typename Shape_, + /// Number of threads participating + int kThreads_, + /// Data type of A elements + typename ElementA, + /// Data type of B elements + typename ElementB, + /// Element type of C matrix + typename ElementC, + /// Inner product operator + typename Operator +> +struct ElementwiseInnerProduct; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// General implementation +template < + /// Size of the matrix product (concept: GemmShape) + typename Shape_, + /// Data type of A elements + typename ElementA_, + /// Data type of B elements + typename ElementB_, + /// Element type of C matrix + typename ElementC_> +struct ElementwiseInnerProduct { + using Shape = Shape_; + using Operator = arch::OpMultiplyAdd; + using ElementC = ElementC_; + + CUTLASS_HOST_DEVICE + void operator()(Array &d, + Array const &a, + Array const &b, + Array const &c) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Shape::kN; ++i) { + d[i] = a[i] * b[i] + c[i]; + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Specialization of half_t +template <> +struct ElementwiseInnerProduct< + gemm::GemmShape<2, 2, 1>, + 1, + half_t, + half_t, + half_t, + arch::OpMultiplyAdd> { + + using Shape = gemm::GemmShape<2, 2, 1>; + using 
Operator = arch::OpMultiplyAdd; + using ElementC = half_t; + + CUTLASS_HOST_DEVICE + void operator()( + Array &d, + Array const &a, + Array const &b, + Array const &c + ) { + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600)) + + __half2 const & A = reinterpret_cast<__half2 const &>(a); + __half2 const & B = reinterpret_cast<__half2 const &>(b); + __half2 const & C = reinterpret_cast<__half2 const &>(c); + + __half2 tmp_D = __hfma2(A, B, C); + + d = reinterpret_cast const &>(tmp_D); + +#else + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < 2; ++i) { + d[i] = a[i] * b[i] + c[i]; + } +#endif + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape, + /// Data type of A elements + typename ElementA, + /// Data type of B elements + typename ElementB, + /// Element type of C matrix + typename ElementC, + /// Concept: arch::OpMultiplyAdd or arch::Mma<> + typename Operator = arch::OpMultiplyAdd, + /// Used for partial specialization + typename Enable = bool +> +struct DepthwiseDirectConvElementwiseInnerProduct; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Gemplate that handles all packed matrix layouts +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Data type of A elements + typename ElementA_, + /// Data type of B elements + typename ElementB_, + /// Element type of C matrix + typename ElementC_, + /// Operator used to compute GEMM + typename Operator_ +> +struct DepthwiseDirectConvElementwiseInnerProductGeneric { + + /// Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + + /// Data type of operand A + using ElementA = ElementA_; + + /// Data type of operand B + using ElementB = ElementB_; + + /// Element type of operand C + using ElementC = 
ElementC_; + + /// Underlying mathematical operator + using Operator = Operator_; + + /// A operand storage + using FragmentA = Array; + + /// B operand storage + using FragmentB = Array; + + /// C operand storage + using FragmentC = Array; + + /// Instruction + using MmaOp = cutlass::conv::thread::ElementwiseInnerProduct< + gemm::GemmShape, + 1, + ElementA, + ElementB, + ElementC, + Operator>; + + + // + // Methods + // + + /// Computes a matrix product D = A * B + C + CUTLASS_HOST_DEVICE + void operator()( + FragmentC & D, + FragmentA const & A, + FragmentB const & B, + FragmentC const & C) { + Array *ptr_D = reinterpret_cast *>(&D); + Array const *ptr_A = + reinterpret_cast const *>(&A); + Array const *ptr_B = + reinterpret_cast const *>(&B); + + MmaOp mma_op; + + // Copy accumulators + D = C; + + // Compute matrix product + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Shape::kN / MmaOp::Shape::kN; ++n) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < Shape::kM; ++m) { + + Array tmpD = ptr_D[m * Shape::kN / MmaOp::Shape::kN + n]; + Array tmpA = ptr_A[m * Shape::kN / MmaOp::Shape::kN + n]; + Array tmpB = ptr_B[n]; + + mma_op(tmpD, tmpA, tmpB, tmpD); + + ptr_D[m * Shape::kN / MmaOp::Shape::kN + n] = tmpD; + + } + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Data type of A elements + typename ElementA_, + /// Data type of B elements + typename ElementB_, + /// Element type of C matrix + typename ElementC_ +> +struct DepthwiseDirectConvElementwiseInnerProduct< + Shape_, + ElementA_, + ElementB_, + ElementC_, + arch::OpMultiplyAdd + > { + /// Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + + /// Data type of operand A + using ElementA = ElementA_; + + /// Data type of operand B + using ElementB = ElementB_; + + /// Element type of 
operand C + using ElementC = ElementC_; + + /// Underlying mathematical operator + using Operator = arch::OpMultiplyAdd; + + /// A operand storage + using FragmentA = + Array; // output_tile_size per thread * groups_per_thread + + /// B operand storage + using FragmentB = Array; // 1 * groups_per_thread + + /// C operand storage + using FragmentC = + Array; // output_tile_size per thread * groups_per_thread + + static bool const use_optimized = 0; + + using ArchMmaOperator = DepthwiseDirectConvElementwiseInnerProductGeneric; + + // + // Methods + // + + /// Computes a matrix product D = A * B + C + CUTLASS_HOST_DEVICE + void operator()( + FragmentC & D, + FragmentA const & A, + FragmentB const & B, + FragmentC const & C) { + + ArchMmaOperator mma; + + mma(D, A, B, C); + + } +}; + +} // namespace thread +} // namespace conv +} // namespace cutlass diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h new file mode 100644 index 0000000000000000000000000000000000000000..2da2b73b3afe3d5f5800c84d2edb2b220003ba83 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h @@ -0,0 +1,485 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + conv::StrideSupport StrideSupport_ = conv::StrideSupport::kUnity, + typename AccessType_ = cutlass::AlignedArray +> +class Conv2dDgradFilterTileAccessIteratorAnalytic; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dDgradFilterTileAccessIteratorAnalytic strided dgrad needs special handling to skip MMAs +// on non-contributing w positions +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + typename AccessType_ +> +class Conv2dDgradFilterTileAccessIteratorAnalytic < + Shape_, + Element_, + ThreadMap_, + conv::StrideSupport::kStrided, + AccessType_ +> { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + 
static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + static_assert(sizeof_bits::value >= 8, + "DGRAD requires elements of size 8b or larger."); + + // + // Parameters structure + // + + using Params = Conv2dAnalyticParams; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + LongIndex iteration_vector_; + char const *pointer_; + + // For a fixed filter position (r,s) find and fill offset_k_, offset_c_ in strided and contiguous dimension + int filter_r_; + int filter_s_; + int start_r_; + int start_s_; + int offset_k_[ThreadMap::Iterations::kStrided]; + int offset_c_[ThreadMap::Iterations::kContiguous]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dDgradFilterTileAccessIteratorAnalytic( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + int start_r, int start_s, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_r_(start_r), + filter_s_(start_s), + start_r_(start_r), + start_s_(start_s) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + offset_c_[c] = threadblock_offset.column() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + } + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_k_[s] = + threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + 
void set_iteration_index(Index index) { + iteration_vector_ = index % kAccessesPerVector; + int residual_access = index / kAccessesPerVector; + iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous; + iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // Moves filter_s + filter_s_ += problem_size_.stride_w; + if (filter_s_ < problem_size_.S) { + return; + } + // Restore filter_s + filter_s_ = start_s_; + + // Move filter_r + filter_r_ += problem_size_.stride_h; + if (filter_r_ < problem_size_.R) { + return; + } + // Restore filter_r + filter_r_ = start_r_; + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_k_[s] += Shape::kRow * problem_size_.split_k_slices; + } + } + + /// Returns the coordinate in the filter tensor w that is currently pointed to + /// by the iterator. 
+ CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int k = offset_k_[iteration_strided_]; + int c = offset_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements; + + return TensorCoord(k, filter_r_, filter_s_, c); + } + + /// Returns true if the current coordinate is within the filter tensor w + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return coord.n() < problem_size_.K && coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dDgradFilterTileAccessIteratorAnalytic &operator++() { + ++iteration_vector_; + if (iteration_vector_ < kAccessesPerVector) { + return *this; + } + iteration_vector_ = 0; + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dDgradFilterTileAccessIteratorAnalytic unity strided dgrad is more performant for dgrad +// on problem sizes with stride = {1x1} +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + typename AccessType_ +> +class Conv2dDgradFilterTileAccessIteratorAnalytic < + Shape_, + Element_, + ThreadMap_, + conv::StrideSupport::kUnity, + AccessType_ +>{ +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kUnity; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + static_assert(sizeof_bits::value >= 8, + "DGRAD requires elements of size 8b or larger."); + + // + // Parameters structure + // + + using Params = Conv2dAnalyticParams; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + LongIndex iteration_vector_; + 
char const *pointer_; + + // For a fixed filter position (r,s) find and fill offset_k_, offset_c_ in strided and contiguous dimension + int filter_r_; + int filter_s_; + int offset_k_[ThreadMap::Iterations::kStrided]; + int offset_c_[ThreadMap::Iterations::kContiguous]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dDgradFilterTileAccessIteratorAnalytic( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + offset_c_[c] = threadblock_offset.column() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + } + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_k_[s] = + threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_vector_ = index % kAccessesPerVector; + int residual_access = index / kAccessesPerVector; + iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous; + iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; 
++s) { + offset_k_[s] += Shape::kRow * problem_size_.split_k_slices; + } + } + + /// Returns the coordinate in the filter tensor w that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int k = offset_k_[iteration_strided_]; + int c = offset_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements; + + return TensorCoord(k, filter_r_, filter_s_, c); + } + + /// Returns true if the current coordinate is within the filter tensor w + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return coord.n() < problem_size_.K && coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dDgradFilterTileAccessIteratorAnalytic &operator++() { + ++iteration_vector_; + if (iteration_vector_ < kAccessesPerVector) { + return *this; + } + iteration_vector_ = 0; + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h new file mode 100644 index 0000000000000000000000000000000000000000..8a5e60b9d134d8ec5d28da7e486bc5c7f6629a39 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h @@ -0,0 +1,619 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + conv::StrideSupport StrideSupport_ = conv::StrideSupport::kUnity, + typename AccessType_ = cutlass::AlignedArray +> +class Conv2dDgradFilterTileAccessIteratorOptimized; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dDgradFilterTileAccessIteratorOptimized unity strided dgrad is more performant for dgrad +// on problem sizes with stride = {1x1} +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + typename AccessType_ +> +class Conv2dDgradFilterTileAccessIteratorOptimized < + Shape_, + Element_, + ThreadMap_, + conv::StrideSupport::kStrided, + AccessType_ + > { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = 
conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + // + // Parameters structure + // + + struct Params : Conv2dStridedDgradFilterIteratorOptimizedParams { + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params(Conv2dStridedDgradFilterIteratorOptimizedParams const &base): + Conv2dStridedDgradFilterIteratorOptimizedParams(base) { } + + CUTLASS_HOST_DEVICE + Params( + Conv2dProblemSize const &problem_size, + Layout const &layout + ): + Conv2dStridedDgradFilterIteratorOptimizedParams( + problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided} + ) { } + + }; + +private: + + Conv2dStridedDgradFilterIteratorOptimizedParams const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + LongIndex iteration_vector_; + char const *pointer_; + + uint32_t predicates_[kAccessesPerVector]; + int filter_k_; + int filter_r_; + int filter_s_; + + int start_r_; + int start_s_; + + int64_t reset_bytes_s_; + int64_t reset_bytes_r_; + + // + // Assertions + // + + // We map predicates into bits packed in this uint32_t container + static_assert(ThreadMap::Iterations::kStrided * + ThreadMap::Iterations::kContiguous < sizeof(predicates_) * 8, + "Currently, the number of loads per iteration is limited by the size of the predicates container."); + +public: + + CUTLASS_HOST_DEVICE + Conv2dDgradFilterTileAccessIteratorOptimized( + 
Conv2dStridedDgradFilterIteratorOptimizedParams const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + int start_r, int start_s, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + predicates_{0}, + filter_r_(start_r), + filter_s_(start_s), + start_r_(start_r), + start_s_(start_s) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.row() + thread_coord.strided(); + Index column = threadblock_offset.column() + thread_coord.contiguous(); + + reset_bytes_s_ = (problem_size_.num_gemm_k_filter_s(start_s_) - 1) * params_.inc_next[0]; + reset_bytes_r_ = reset_bytes_s_ + + (problem_size_.num_gemm_k_filter_r(start_r_) - 1) * params_.inc_next[1]; + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int filter_k = filter_k_ + s * ThreadMap::Delta::kStrided; + int filter_c = column + c * ThreadMap::Delta::kContiguous; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kAccessesPerVector; ++v) { + + uint32_t pred = ((filter_k < problem_size_.K && (filter_c + v * AccessType::kElements) < problem_size_.C) ? 
1u : 0); + + int pred_idx = c + s * ThreadMap::Iterations::kContiguous; + + predicates_[v] |= (pred << pred_idx); + } + } + } + + TensorCoord coord{filter_k_, filter_r_, filter_s_, column}; + + pointer_ += params_.layout(coord) * sizeof_bits::value / 8; + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_vector_ = index % kAccessesPerVector; + int residual_access = index / kAccessesPerVector; + iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous; + iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_DEVICE + void advance() { + + int next_idx = 0; + LongIndex reset_bytes = params_.reset_bytes; + + // Move filter_s by stride_w + filter_s_ += problem_size_.stride_w; + if (filter_s_ >= problem_size_.S) { + + // Restore filter_s + filter_s_ = start_s_; + + // Move filter_r by stride_h + filter_r_ += problem_size_.stride_h; +#if 0 + bool check = (filter_r_ < problem_size_.R); + + filter_r_ = check ? filter_r_ : start_r_; + next_idx = check ? 1 : 2; + reset_bytes += (check ? 
reset_bytes_s_ : reset_bytes_r_); +#else + asm volatile( + "{\n\t" + " .reg .pred %%p;\n\t" + " .reg .s64 t1;\n\t" + " setp.lt.s32 %%p, %3, %4;\n\t" + " selp.s32 %0, %3, %5, %%p;\n\t" + " selp.s32 %1, 1, 2, %%p;\n\t" + " selp.s64 t1, %6, %7, %%p;\n\t" + " add.s64 %2, %8, t1;\n\t" + "}\n" + : "=r"(filter_r_), "=r"(next_idx), "=l"(reset_bytes) + : "r"(filter_r_), "r"(problem_size_.R), "r"(start_r_), + "l"(reset_bytes_s_), "l"(reset_bytes_r_), "l"(reset_bytes)); +#endif + } + + // offset pointers by offset_bytes + pointer_ += (params_.inc_next[next_idx] - reset_bytes); + + if (next_idx == 2) { + filter_k_ += params_.filter_k_delta; + } + + // Clear predicates if needed + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + if (filter_k_ + s * ThreadMap::Delta::kStrided >= problem_size_.K) { + uint32_t kClearMask = ((1u << ThreadMap::Iterations::kContiguous) - 1) << (s * ThreadMap::Iterations::kContiguous); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kAccessesPerVector; ++v) { + predicates_[v] = (predicates_[v] & (~kClearMask)); + } + } + } + } + + /// Returns true if the current coordinate is within the filter tensor W + CUTLASS_HOST_DEVICE + bool valid() { + LongIndex pred_idx = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous; + return (predicates_[iteration_vector_] & (1u << pred_idx)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + return reinterpret_cast(pointer_ + + iteration_contiguous_ * ThreadMap::Delta::kContiguous * sizeof_bits::value / 8) + iteration_vector_; + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dDgradFilterTileAccessIteratorOptimized &operator++() { + ++iteration_vector_; + if (iteration_vector_ < kAccessesPerVector) { + return *this; + } + iteration_vector_ = 0; + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + 
return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + + // Move to the next K coordinate within the tile + pointer_ += params_.inc_next_strided; + + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dDgradFilterTileAccessIteratorOptimized unity strided dgrad is more performant for dgrad +// on problem sizes with stride = {1x1} +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + typename AccessType_ +> +class Conv2dDgradFilterTileAccessIteratorOptimized < + Shape_, + Element_, + ThreadMap_, + conv::StrideSupport::kUnity, + AccessType_ + > { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kUnity; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be 
divisible by the access type."); + + // + // Parameters structure + // + + struct Params : Conv2dDgradFilterIteratorOptimizedParams { + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params(Conv2dDgradFilterIteratorOptimizedParams const &base): + Conv2dDgradFilterIteratorOptimizedParams(base) { } + + CUTLASS_HOST_DEVICE + Params( + Conv2dProblemSize const &problem_size, + Layout const &layout + ): + Conv2dDgradFilterIteratorOptimizedParams( + problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided} + ) { } + + }; + +private: + + Conv2dDgradFilterIteratorOptimizedParams const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + LongIndex iteration_vector_; + char const *pointer_; + + uint32_t predicates_[kAccessesPerVector]; + int filter_rs_; + int filter_k_; + + // + // Assertions + // + + // We map predicates into bits packed in this uint32_t container + static_assert(ThreadMap::Iterations::kStrided * + ThreadMap::Iterations::kContiguous < sizeof(predicates_) * 8, + "Currently, the number of loads per iteration is limited by the size of the predicates container."); + +public: + + CUTLASS_HOST_DEVICE + Conv2dDgradFilterTileAccessIteratorOptimized( + Conv2dDgradFilterIteratorOptimizedParams const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + predicates_{0}, + filter_rs_(0), + filter_k_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.row() + thread_coord.strided(); + Index column = 
threadblock_offset.column() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int filter_k = filter_k_ + s * ThreadMap::Delta::kStrided; + int filter_c = column + c * ThreadMap::Delta::kContiguous; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kAccessesPerVector; ++v) { + + uint32_t pred = ((filter_k < problem_size_.K && (filter_c + v * AccessType::kElements) < problem_size_.C) ? 1u : 0); + + int pred_idx = c + s * ThreadMap::Iterations::kContiguous; + + predicates_[v] |= (pred << pred_idx); + } + } + } + + pointer_ += ( + filter_k_ * params.layout.stride()[2] + column + ) * sizeof_bits::value / 8; + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_vector_ = index % kAccessesPerVector; + int residual_access = index / kAccessesPerVector; + iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous; + iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + LongIndex next = params_.inc_next_rs; + + // moves to the next tile + ++filter_rs_; + if (filter_rs_ == params_.RS) { + + filter_rs_ = 0; + next = params_.inc_next_k; + filter_k_ += params_.filter_k_delta; + } + + // Clear predicates if needed + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + if (filter_k_ + s * ThreadMap::Delta::kStrided >= problem_size_.K) { + uint32_t kClearMask = ((1u << ThreadMap::Iterations::kContiguous) - 1) << (s * ThreadMap::Iterations::kContiguous); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kAccessesPerVector; ++v) 
{ + predicates_[v] = (predicates_[v] & (~kClearMask)); + } + } + } + + pointer_ += next; + } + + /// Returns true if the current coordinate is within the filter tensor W + CUTLASS_HOST_DEVICE + bool valid() { + LongIndex pred_idx = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous; + return (predicates_[iteration_vector_] & (1u << pred_idx)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + return reinterpret_cast(pointer_ + + iteration_contiguous_ * ThreadMap::Delta::kContiguous * sizeof_bits::value / 8) + iteration_vector_; + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dDgradFilterTileAccessIteratorOptimized &operator++() { + ++iteration_vector_; + if (iteration_vector_ < kAccessesPerVector) { + return *this; + } + iteration_vector_ = 0; + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + + // Move to the next K coordinate within the tile + pointer_ += params_.inc_next_strided; + + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h new file mode 100644 index 0000000000000000000000000000000000000000..b33645c1783c8b12cc9d8d6e1d93dbffb3f47f1c --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h @@ -0,0 +1,606 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/functional.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + conv::StrideSupport StrideSupport_ = conv::StrideSupport::kStrided, + typename AccessType_ = cutlass::AlignedArray +> +class Conv2dDgradOutputGradientTileAccessIteratorAnalytic; +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dDgradOutputGradientTileAccessIteratorAnalytic strided dgrad needs special handling using +// unscaled coordinations +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + typename AccessType_ +> +class Conv2dDgradOutputGradientTileAccessIteratorAnalytic < + Shape_, + Element_, + ThreadMap_, + conv::StrideSupport::kStrided, + AccessType_ +> { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const 
kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + static_assert(sizeof_bits::value >= 8, + "DGRAD requires elements of size 8b or greater."); + + // + // Simpligying assertions + // + + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + using Params = Conv2dDgradOutputGradientTileAccessIteratorAnalyticParams; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + LongIndex iteration_vector_; + char const *pointer_; + + int filter_k_; + int filter_r_; + int filter_s_; + int start_r_; + int start_s_; + + int offset_n_[ThreadMap::Iterations::kStrided]; + int offset_p_[ThreadMap::Iterations::kStrided]; + int offset_q_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientTileAccessIteratorAnalytic( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + FastDivmod const &stride_h_divmod, FastDivmod const &stride_w_divmod, + int start_r, int start_s, + MatrixCoord const &threadblock_offset = MatrixCoord() // threadblock offset - units are whole CTA tiles + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_k_(0), + filter_r_(start_r), + filter_s_(start_s), + start_r_(start_r), + start_s_(start_s) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.column() + thread_coord.contiguous(); + + int filter_r = filter_r_; + int filter_s = filter_s_; + + if 
(problem_size_.mode == Mode::kConvolution) { + filter_r = (problem_size_.R - 1 - filter_r); + filter_s = (problem_size_.S - 1 - filter_s); + } + + // Starting h, w positions for filter position in gemm_k=0 + int start_h, start_w; + strided_dgrad_starting_coords( + problem_size_, + stride_h_divmod, stride_w_divmod, + filter_r, filter_s, + start_h, start_w); + + // Effective P and Q for filter position required for remapping NHW rows + int P = (problem_size_.H - start_h + problem_size_.stride_h - 1) / problem_size_.stride_h; + int Q = (problem_size_.W - start_w + problem_size_.stride_w - 1) / problem_size_.stride_w; + + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + int offset_npq = (threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided) % params_.tiled_rows_per_filter; + + // (STEP 1) [reorder NHW rows to start with same filter positions] + offset_n_[s] = offset_npq / (P * Q); + int residual = offset_npq % (P * Q); + + int p = (residual / Q); + int q = (residual % Q); + + int mapped_h = (start_h + p * problem_size_.stride_h); + int mapped_w = (start_w + q * problem_size_.stride_w); + + // Access (p, q) coordinates for Dy tensor and a filter position in gemm_k=0 + // note that (h + pad_h - filter_r) and (w + pad_w - filter_s) are divisible + // by stride_h and stride_w + offset_p_[s] = (mapped_h + problem_size_.pad_h - filter_r) / problem_size_.stride_h; + offset_q_[s] = (mapped_w + problem_size_.pad_w - filter_s) / problem_size_.stride_w; + } + } + + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_vector_ = index % kAccessesPerVector; + int residual_access = index / kAccessesPerVector; + iteration_contiguous_ = 
residual_access % ThreadMap::Iterations::kContiguous; + iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + // Move filter_s by stride_w + filter_s_ += problem_size_.stride_w; + if (filter_s_ < problem_size_.S) { + return; + } + + // Restore filter_s + filter_s_ = start_s_; + + // Move filter_r by stride_h + filter_r_ += problem_size_.stride_h; + if (filter_r_ < problem_size_.R) { + return; + } + + // Restore filter_r + filter_r_ = start_r_; + + // Move filter_k + filter_k_ += Shape_::kColumn * problem_size_.split_k_slices; + } + + /// Returns the coordinate in the output tensor Dy that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + int n = offset_n_[iteration_strided_]; + int p = offset_p_[iteration_strided_]; + int q = offset_q_[iteration_strided_]; + + int conv_sign = (problem_size_.mode == Mode::kConvolution ? 
1 : -1); + + p += (conv_sign * (filter_r_ / problem_size_.stride_h)); + q += (conv_sign * (filter_s_ / problem_size_.stride_w)); + + int k = filter_k_ + iteration_vector_ * AccessType::kElements; + + return TensorCoord( + n, + p, + q, + k); + } + + + /// Returns true if the current coordinate is within the output tensor Dy + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return + coord.n() < problem_size_.N && + coord.h() >= 0 && coord.h() < problem_size_.P && + coord.w() >= 0 && coord.w() < problem_size_.Q && + coord.c() < problem_size_.K; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientTileAccessIteratorAnalytic &operator++() { + ++iteration_vector_; + if (iteration_vector_ < kAccessesPerVector) { + return *this; + } + iteration_vector_ = 0; + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv2dDgradOutputGradientTileAccessIteratorAnalytic for unity strides can be optimized by +// eliminating modulo arithmetic to compute unscaled coordinates +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + typename AccessType_ +> +class Conv2dDgradOutputGradientTileAccessIteratorAnalytic < + Shape_, + Element_, + ThreadMap_, + conv::StrideSupport::kUnity, + AccessType_ +> { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kUnity; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + static_assert(sizeof_bits::value >= 8, + "DGRAD requires elements of size 8b or greater."); + + // + // Simpligying assertions + // + + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + struct Params { + + Layout layout; + + // 
+ // Methods + // + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params( + Conv2dProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + } + }; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + LongIndex iteration_vector_; + char const *pointer_; + + int filter_k_; + int filter_r_; + int filter_s_; + + int offset_n_[ThreadMap::Iterations::kStrided]; + int offset_w_[ThreadMap::Iterations::kStrided]; + int offset_h_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientTileAccessIteratorAnalytic( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // threadblock offset - units are whole CTA tiles + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_k_(0), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.column() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + int offset_nhw = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + offset_n_[s] = offset_nhw / (problem_size_.H * problem_size_.W); + int residual = offset_nhw % (problem_size_.H * problem_size_.W); + + offset_h_[s] = residual / problem_size_.W; + offset_w_[s] = residual % problem_size_.W; + } + } + + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_vector_ = index % kAccessesPerVector; + int residual_access = index / kAccessesPerVector; + 
iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous; + iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // move to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + + filter_k_ += Shape_::kColumn * problem_size_.split_k_slices; + } + + /// Returns the coordinate in the output tensor Dy that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int n = offset_n_[iteration_strided_]; + int h = offset_h_[iteration_strided_]; + int w = offset_w_[iteration_strided_]; + + int r = filter_r_; + int s = filter_s_; + + if (problem_size_.mode == Mode::kConvolution) { + r = (problem_size_.R - 1 - r); + s = (problem_size_.S - 1 - s); + } + + int p = (h + problem_size_.pad_h - r * problem_size_.dilation_h) / problem_size_.stride_h; + int q = (w + problem_size_.pad_w - s * problem_size_.dilation_w) / problem_size_.stride_w; + + int k = filter_k_ + iteration_vector_ * AccessType::kElements; + + return TensorCoord(n, p, q, k); + } + + /// Returns true if the current coordinate is within the output tensor Dy + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.h() >= 0 && coord.h() < problem_size_.P && + coord.w() >= 0 && coord.w() < problem_size_.Q && + coord.c() < problem_size_.K; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 
8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientTileAccessIteratorAnalytic &operator++() { + ++iteration_vector_; + if (iteration_vector_ < kAccessesPerVector) { + return *this; + } + iteration_vector_ = 0; + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // Conv2dDgradFilterTileAccessIteratorAnalytic unity stride specialization + // only supports (stride_h, stride_w) = (1, 1) + if (problem_size.stride() != MatrixCoord({1, 1})) { + return Status::kErrorNotSupported; + } + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h new file mode 100644 index 0000000000000000000000000000000000000000..638c6607095ce85f7c1b135296d974bdf295621a --- /dev/null +++ 
b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h @@ -0,0 +1,821 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + conv::StrideSupport StrideSupport_ = conv::StrideSupport::kUnity, + typename AccessType_ = cutlass::AlignedArray +> +class Conv2dDgradOutputGradientTileAccessIteratorOptimized; +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +// Conv2dDgradOutputGradientTileAccessIteratorOptimized strided dgrad needs special handling +// to skip MMAs (Dx = Dy * w) on invalid filter positions +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + typename 
Shape_, + typename Element_, + typename ThreadMap_, + typename AccessType_ +> +class Conv2dDgradOutputGradientTileAccessIteratorOptimized < + Shape_, + Element_, + ThreadMap_, + conv::StrideSupport::kStrided, + AccessType_ +> { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + using Mask = uint64_t; + + static_assert(sizeof_bits::value >= 8, + "DGRAD requires elements of size 8b or greater."); + + // + // Simpligying assertions + // + + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + using Params = Conv2dStridedDgradOutputGradientIteratorOptimizedParams; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + LongIndex iteration_vector_; + + // One pointer per access + char const *pointer_[ThreadMap::Iterations::kStrided]; + + int filter_k_; + int filter_r_; + int filter_s_; + int start_r_; + int start_s_; + int64_t reset_bytes_s_; + int64_t reset_bytes_r_; + + Index masks_[ThreadMap::Iterations::kStrided][kAccessesPerVector][2]; + +public: + + CUTLASS_HOST_DEVICE + 
Conv2dDgradOutputGradientTileAccessIteratorOptimized( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + FastDivmod const &stride_h_divmod, FastDivmod const &stride_w_divmod, + int start_r, int start_s, + MatrixCoord const &threadblock_offset = MatrixCoord() // threadblock offset - units are whole CTA tiles + ): + params_(params), + problem_size_(problem_size), + filter_k_(0), + filter_r_(start_r), + filter_s_(start_s), + start_r_(start_r), + start_s_(start_s) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.column() + thread_coord.contiguous(); + + reset_bytes_s_ = (problem_size_.num_gemm_k_filter_s(start_s_) - 1) * params_.inc_next[0]; + + reset_bytes_r_ = (problem_size_.num_gemm_k_filter_s(start_s_) - 1) * params_.inc_next[0] + + (problem_size_.num_gemm_k_filter_r(start_r_) - 1) * params_.inc_next[1]; + + int offset_n[ThreadMap::Iterations::kStrided]; + int offset_p[ThreadMap::Iterations::kStrided]; + int offset_q[ThreadMap::Iterations::kStrided]; + + int filter_r = filter_r_; + int filter_s = filter_s_; + + if (problem_size_.mode == Mode::kConvolution) { + filter_r = (problem_size_.R - 1 - filter_r); + filter_s = (problem_size_.S - 1 - filter_s); + } + + // Starting h, w positions for filter position in gemm_k=0 + int start_h, start_w; + strided_dgrad_starting_coords( + problem_size_, + stride_h_divmod, stride_w_divmod, + filter_r, filter_s, + start_h, start_w); + + + // Effective starting P and Q for filter position required for remapping NHW rows + int P = (problem_size_.H - start_h + problem_size_.stride_h - 1) / problem_size_.stride_h; + int Q = (problem_size_.W - start_w + problem_size_.stride_w - 1) / problem_size_.stride_w; + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + pointer_[s] = reinterpret_cast(ptr); + + int offset_npq = (threadblock_offset.row() + thread_coord.strided() + s * 
ThreadMap::Delta::kStrided) % params_.tiled_rows_per_filter; + + // (STEP 1) [reorder NHW rows to start with same filter positions] + offset_n[s] = offset_npq / (P * Q); + int residual = offset_npq % (P * Q); + + int p = (residual / Q); + int q = (residual % Q); + + int mapped_h = (start_h + p * problem_size_.stride_h); + int mapped_w = (start_w + q * problem_size_.stride_w); + + // Access (p, q) coordinates for Dy tensor for filter position in gemm_k=0 + // note that (h + pad_h - filter_r) and (w + pad_w - filter_s) are ensured to be + // divisible by stride_h and stride_w + offset_p[s] = (mapped_h + problem_size_.pad_h - filter_r) / problem_size_.stride_h; + offset_q[s] = (mapped_w + problem_size_.pad_w - filter_s) / problem_size_.stride_w; + + // Initialize pointers for gemm_k=0 + TensorCoord coord{offset_n[s], offset_p[s], offset_q[s], filter_k_}; + + pointer_[s] += params_.layout(coord) * sizeof_bits::value / 8; + } + + // + // Precompute mask predicates + // + clear_mask(); + + CUTLASS_PRAGMA_NO_UNROLL + for (int r = start_r; r < problem_size_.R; r += problem_size_.stride_h) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int p = offset_p[s_idx] ; + + p += (params_.conv_sign * (r / problem_size_.stride_h)); + + bool pred = (offset_n[s_idx] < problem_size_.N && p >= 0 && p < problem_size_.P); + + CUTLASS_PRAGMA_UNROLL + for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) { + masks_[s_idx][v_idx][0] |= (pred << r); + } + } + } + + CUTLASS_PRAGMA_NO_UNROLL + for(int s = start_s; s < problem_size_.S; s += problem_size_.stride_w) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int q = offset_q[s_idx]; + q += (params_.conv_sign * (s / problem_size_.stride_w)); + + bool pred = (q >=0 && q < problem_size_.Q); + + CUTLASS_PRAGMA_UNROLL + for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) { + masks_[s_idx][v_idx][1] |= (pred << s); + } + } + } + + 
CUTLASS_PRAGMA_UNROLL + for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) { + clear_mask(v_idx, (filter_k_ + v_idx * AccessType::kElements) >= problem_size.K); + } + + set_iteration_index(0); + } + + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}); + } + +private: + + /// Adds a pointer offset in units of element + CUTLASS_HOST_DEVICE + void add_byte_offset_(LongIndex byte_offset, LongIndex byte_reset = 0) { + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + pointer_[s] += byte_offset - byte_reset; + } + } + +public: + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_vector_ = index % kAccessesPerVector; + int residual_access = index / kAccessesPerVector; + iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous; + iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + add_byte_offset_(pointer_offset * sizeof_bits::value / 8); + } + + CUTLASS_DEVICE + void advance() { + + int next_idx = 0; + int64_t reset_bytes = 0; + + // Move filter_s by stride_w + filter_s_ += problem_size_.stride_w; + if (filter_s_ >= problem_size_.S) { + + // Restore filter_s + filter_s_ = start_s_; + + // Move filter_r by stride_h + filter_r_ += problem_size_.stride_h; +#if 0 + if (filter_r_ < problem_size_.R) { + + next_idx = 1; + + // Restore bytes in q coordinate (Mma in filter s dimension) + reset_bytes = reset_bytes_s_; + + } else { + + // Restore filter_r + filter_r_ = start_r_; + + next_idx = 2; + + // Restore bytes in p and q coordinate (Mma in filter s and r dimension) + reset_bytes = reset_bytes_r_; + } +#else + asm volatile( + "{\n\t" + " .reg 
.pred %%p;\n\t" + " setp.lt.s32 %%p, %3, %4;\n\t" + " selp.s32 %0, %3, %5, %%p;\n\t" + " selp.s32 %1, 1, 2, %%p;\n\t" + " selp.s64 %2, %6, %7, %%p;\n\t" + "}\n" + : "=r"(filter_r_), "=r"(next_idx), "=l"(reset_bytes) + : "r"(filter_r_), "r"(problem_size_.R), "r"(start_r_), + "l"(reset_bytes_s_), "l"(reset_bytes_r_)); +#endif + } + + // offset pointers by offset_bytes + add_byte_offset_(params_.inc_next[next_idx] - reset_bytes); + + if (next_idx == 2) { + filter_k_ += params_.filter_k_delta; + } + + CUTLASS_PRAGMA_UNROLL + for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) { + clear_mask(v_idx, (filter_k_ + v_idx * AccessType::kElements) >= problem_size_.K); + } + } + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void clear_mask(bool clear = true) { + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kAccessesPerVector; ++v) { + masks_[s][v][0] = clear ? Mask(0) : masks_[s][v][0]; + masks_[s][v][1] = clear ? Mask(0) : masks_[s][v][1]; + } + } + } + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void clear_mask(int v, bool clear = true) { + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + masks_[s][v][0] = clear ? Mask(0) : masks_[s][v][0]; + masks_[s][v][1] = clear ? 
Mask(0) : masks_[s][v][1]; + } + } + + /// Returns true if the current coordinate is within the output tensor Dy + CUTLASS_HOST_DEVICE + bool valid() const { + return + (masks_[iteration_strided_][iteration_vector_][0] & (Index(1) << filter_r_)) && + (masks_[iteration_strided_][iteration_vector_][1] & (Index(1) << filter_s_)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + return reinterpret_cast(pointer_[iteration_strided_]) + iteration_vector_; + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientTileAccessIteratorOptimized &operator++() { + ++iteration_vector_; + if (iteration_vector_ < kAccessesPerVector) { + return *this; + } + iteration_vector_ = 0; + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.K % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    // Limit on filter size
+    if (problem_size.R > 32 || problem_size.S > 32) {
+      return Status::kErrorNotSupported;
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Conv2dDgradOutputGradientTileAccessIteratorOptimized unity stride dgrad is optimized for dgrad
+// with problem stride = {1x1}
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename Shape_,
+  typename Element_,
+  typename ThreadMap_,
+  typename AccessType_
+>
+class Conv2dDgradOutputGradientTileAccessIteratorOptimized <
+  Shape_,
+  Element_,
+  ThreadMap_,
+  conv::StrideSupport::kUnity,
+  AccessType_
+> {
+public:
+
+  //
+  // Types
+  //
+
+  // NOTE(review): several template-argument lists in this vendored copy appear to have been
+  // stripped during generation (e.g. 'cutlass::TensorRef', 'reinterpret_cast(ptr)' and
+  // 'sizeof_bits::value' below have empty argument lists); compare against the upstream
+  // CUTLASS header before relying on this file.
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = layout::TensorNHWC;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kUnity;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  // 64-bit word of filter-position predicates (one bit per filter position)
+  using Mask = uint64_t;
+
+  //
+  // Simplifying assertions
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv2dDgradOutputGradientIteratorOptimizedParams;
+
+private:
+
+  Conv2dDgradOutputGradientIteratorOptimizedParams const &params_;
+  Conv2dProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+
+  // One pointer per access
+  char const *pointer_[ThreadMap::Iterations::kStrided];
+
+  // current filter position (r, s)
+  int filter_r_;
+  int filter_s_;
+  int filter_k_;
+
+  // masks_[s][v][0] holds one predicate bit per filter row index r, and masks_[s][v][1]
+  // one bit per filter column index s (see the two precompute loops in the constructor)
+  Index masks_[ThreadMap::Iterations::kStrided][kAccessesPerVector][2];
+
+public:
+
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradOutputGradientTileAccessIteratorOptimized(
+    Conv2dDgradOutputGradientIteratorOptimizedParams const &params,
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()       // tile index - units are threadblock-scoped tiles
+  ):
+    params_(params),
+    problem_size_(problem_size),
+    filter_k_(0),
+    filter_r_(0),
+    filter_s_(0) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    filter_k_ = threadblock_offset.column() + thread_coord.contiguous();
+
+    int offset_n[ThreadMap::Iterations::kStrided];
+    int offset_h[ThreadMap::Iterations::kStrided];
+    int offset_w[ThreadMap::Iterations::kStrided];
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+
+      pointer_[s] = reinterpret_cast(ptr);
+
+      int offset_nhw = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+
+      // The subsequent fast_divmod() operations are equivalent to the following logical computation:
+      //
+      //
+      // offset_n[s] = offset_nhw / (problem_size_.H * problem_size_.W);
+      // int residual = offset_nhw % (problem_size_.H * problem_size_.W);
+      //
+      // offset_h[s] = residual / problem_size_.W;
+      // offset_w[s] = residual % problem_size_.W;
+      //
+
+      int residual;
+
+      params_.hw_divmod(offset_n[s], residual, offset_nhw);
+      params_.w_divmod(offset_h[s], offset_w[s], residual);
+
+      TensorCoord coord = at_(offset_n[s], offset_h[s], offset_w[s], 0, 0);
+
+      pointer_[s] += params_.layout(coord) * sizeof_bits::value / 8;
+    }
+
+    clear_mask();
+
+    // Precompute one predicate bit per filter row r (word [0] of masks_)
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (int r = 0; r < problem_size_.R; ++r) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
+
+        int r_ = r;
+        if (problem_size_.mode == Mode::kConvolution) {
+          r_ = problem_size_.R - 1 - r;
+        }
+
+        int p = offset_h[s_idx] + problem_size_.pad_h - r_ * problem_size_.dilation_h;
+
+        bool pred = (offset_n[s_idx] < problem_size_.N && p >= 0 && p < problem_size_.P);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
+          masks_[s_idx][v_idx][0] |= (pred << r);
+        }
+      }
+    }
+
+    // Precompute one predicate bit per filter column s (word [1] of masks_)
+    CUTLASS_PRAGMA_NO_UNROLL
+    for (int s = 0; s < problem_size_.S; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) {
+
+        int s_ = s;
+        if (problem_size_.mode == Mode::kConvolution) {
+          s_ = problem_size_.S - 1 - s;
+        }
+
+        int q = offset_w[s_idx] + problem_size_.pad_w - s_ * problem_size_.dilation_w;
+
+        bool pred = (q >= 0 && q < problem_size_.Q);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
+          masks_[s_idx][v_idx][1] |= (pred << s);
+        }
+      }
+    }
+
+    // Disable any vector accesses that fall outside the K extent
+    CUTLASS_PRAGMA_UNROLL
+    for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
+      clear_mask(v_idx, filter_k_ + v_idx * AccessType::kElements >= problem_size.K);
+    }
+
+    set_iteration_index(0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
+    return Params(problem_size,
+                  layout,
+                  sizeof_bits::value,
+                  {Shape::kRow, Shape::kColumn},
+                  ThreadMap::kThreads,
+                  ThreadMap::kElementsPerAccess,
+                  {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided},
+                  {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided});
+  }
+
+private:
+
+  /// Returns the coordinate in the output gradient tensor dy that is corresponding to
+  // activation nhw and filter position k, r, s
+  CUTLASS_HOST_DEVICE
+  TensorCoord at_(int n, int h, int w, int r, int s) const {
+
+    // Cross-correlation vs. convolution: convolution flips the filter
+    if (problem_size_.mode == Mode::kConvolution) {
+      r = problem_size_.R - 1 - r;
+      s = problem_size_.S - 1 - s;
+    }
+
+    int p = h + problem_size_.pad_h - r * problem_size_.dilation_h;
+    int q = w + problem_size_.pad_w - s * problem_size_.dilation_w;
+
+    return TensorCoord(n, p, q, filter_k_);
+  }
+
+  /// Adds a pointer offset in units of element
+  CUTLASS_HOST_DEVICE
+  void add_byte_offset_(LongIndex byte_offset) {
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      pointer_[s] += byte_offset;
+    }
+  }
+
+public:
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    add_byte_offset_(pointer_offset * sizeof_bits::value / 8);
+  }
+
+  CUTLASS_HOST_DEVICE
+  void advance() {
+
+    // next_idx selects which precomputed pointer increment to apply:
+    //   0 -> next filter column (s), 1 -> next filter row (r), 2 -> next K tile
+    int next_idx = 0;
+
+    // moves to the next tile
+    ++filter_s_;
+    if (filter_s_ == problem_size_.S) {
+      filter_s_ = 0;
+      ++filter_r_;
+
+      if (filter_r_ < problem_size_.R) {
+        next_idx = 1;
+      }
+      else {
+        filter_r_ = 0;
+        next_idx = 2;
+      }
+    }
+
+    add_byte_offset_(params_.inc_next[next_idx]);
+
+    if (next_idx == 2) {
+      filter_k_ += params_.filter_k_delta;
+    }
+
+    // Re-evaluate the K-extent predicate after filter_k_ may have advanced
+    CUTLASS_PRAGMA_UNROLL
+    for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) {
+      clear_mask(v_idx, (filter_k_ + v_idx * AccessType::kElements) >= problem_size_.K);
+    }
+  }
+
+  /// Clears the predicates
+  CUTLASS_HOST_DEVICE
+  void clear_mask(bool clear = true) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int v = 0; v < kAccessesPerVector; ++v) {
+        masks_[s][v][0] = clear ? Mask(0) : masks_[s][v][0];
+        masks_[s][v][1] = clear ? Mask(0) : masks_[s][v][1];
+      }
+    }
+  }
+
+  /// Clears the predicates (for vector access index v only)
+  CUTLASS_HOST_DEVICE
+  void clear_mask(int v, bool clear = true) {
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      masks_[s][v][0] = clear ? Mask(0) : masks_[s][v][0];
+      masks_[s][v][1] = clear ? Mask(0) : masks_[s][v][1];
+    }
+  }
+
+  // Returns true when both the r-bit and the s-bit of the current access's masks are set
+  CUTLASS_HOST_DEVICE
+  bool valid() {
+
+    return
+      (masks_[iteration_strided_][iteration_vector_][0] & (Index(1) << filter_r_)) &&
+      (masks_[iteration_strided_][iteration_vector_][1] & (Index(1) << filter_s_));
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    return reinterpret_cast(pointer_[iteration_strided_]) + iteration_vector_;
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv2dDgradOutputGradientTileAccessIteratorOptimized &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // This is specialized for unit stride
+    if (problem_size.stride() != MatrixCoord({1, 1})) {
+      return Status::kErrorNotSupported;
+    }
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.K % AccessType::kElements) {
+      return Status::kErrorNotSupported;
+    }
+
+    // Limit on filter size
+    if (problem_size.R > 32 || problem_size.S > 32) {
+      return Status::kErrorNotSupported;
+    }
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h
new file mode 100644
index 0000000000000000000000000000000000000000..e4eb011e1c675757b9f1fa3111c2de0db658cad5
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h
@@ -0,0 +1,332 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2.
Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile)
+    matrix from memory.
+
+    This iterator assumes TensorNHWC or TensorNCxHWx layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/threadblock/conv2d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Analytic (div/mod-per-access) tile access iterator over the fprop activation tensor.
+// NOTE(review): several template-argument lists in this vendored copy appear stripped
+// (e.g. 'cutlass::AlignedArray', 'cutlass::TensorRef' below); compare against the
+// upstream CUTLASS header before relying on this file.
+template <
+  typename Shape_,
+  typename Element_,
+  typename Layout_,
+  typename ThreadMap_,
+  typename AccessType_ = cutlass::AlignedArray,
+  conv::GroupMode GroupMode_ = conv::GroupMode::kNone
+>
+class Conv2dFpropActivationTileAccessIteratorAnalytic {
+public:
+
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+  static conv::GroupMode const kGroupMode = GroupMode_;
+
+  static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  //
+  // Simplifying assertions
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv2dAnalyticParams;
+
+private:
+
+  Params const &params_;
+  Conv2dProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+  char const *pointer_;
+
+  // current filter position (c, r, s) plus grouped-convolution bookkeeping
+  int filter_c_;
+  int filter_r_;
+  int filter_s_;
+  int filter_c_init_;
+  int group_idx_offset_;
+  int channels_per_group_;
+  int crs_cnt_;
+  int crs_per_group_;
+
+  // per-strided-access decomposition of the GEMM row index into (n, p, q)
+  int offset_n_[ThreadMap::Iterations::kStrided];
+  int offset_p_[ThreadMap::Iterations::kStrided];
+  int offset_q_[ThreadMap::Iterations::kStrided];
+
+public:
+
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropActivationTileAccessIteratorAnalytic(
+    Params const &params,
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()       // tile index - units are threadblock-scoped tiles
+  ):
+    params_(params),
+    problem_size_(problem_size),
+    pointer_(reinterpret_cast(ptr)),
+    crs_cnt_(0),
+    group_idx_offset_(0),
+    filter_c_(0),
+    filter_r_(0),
+    filter_s_(0) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    filter_c_ = threadblock_offset.column() + thread_coord.contiguous();
+
+    if (kGroupMode != conv::GroupMode::kNone) {
+      filter_c_init_ = filter_c_;
+      channels_per_group_ = problem_size_.C / problem_size_.groups;
+      crs_per_group_ = problem_size_.S * problem_size_.R * ((channels_per_group_ + Shape::kColumn - 1) / Shape::kColumn);
+    }
+
+    // Analytic variant: decompose the GEMM row index with plain integer div/mod
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      int offset_npq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+
+      offset_n_[s] = offset_npq / (problem_size_.P * problem_size_.Q);
+      int residual = offset_npq % (problem_size_.P * problem_size_.Q);
+
+      offset_p_[s] = residual / problem_size_.Q;
+      offset_q_[s] = residual % problem_size_.Q;
+    }
+
+    set_iteration_index(0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
+    return Params(problem_size, layout);
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits::value / 8;
+  }
+
+  // Advances to the next (r, s, c) tile: s fastest, then r, then the channel tile
+  CUTLASS_HOST_DEVICE
+  void advance() {
+    // moves to the next tile
+    if (kGroupMode != conv::GroupMode::kNone) {
+      ++crs_cnt_;
+    }
+
+    ++filter_s_;
+    if (filter_s_ < problem_size_.S) {
+      return;
+    }
+    filter_s_ = 0;
+    ++filter_r_;
+    if (filter_r_ < problem_size_.R) {
+      return;
+    }
+    filter_r_ = 0;
+
+    if (kGroupMode == conv::GroupMode::kNone) {
+      filter_c_ += Shape::kColumn * problem_size_.split_k_slices;
+    } else {
+      if (crs_cnt_ == crs_per_group_) {
+        // moves to next group
+        crs_cnt_ = 0;
+        ++group_idx_offset_;
+        filter_c_ = group_idx_offset_ * channels_per_group_ + filter_c_init_;
+      } else {
+        filter_c_ += Shape::kColumn * problem_size_.split_k_slices;
+      }
+    }
+  }
+
+  /// Returns the coordinate in the activations tensor X that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+    int n = offset_n_[iteration_strided_];
+    int p = offset_p_[iteration_strided_];
+    int q = offset_q_[iteration_strided_];
+
+    int r = filter_r_;
+    int s = filter_s_;
+
+    // Cross-correlation vs. convolution: convolution flips the filter
+    if (problem_size_.mode == Mode::kConvolution) {
+      r = (problem_size_.R - 1 - filter_r_);
+      s = (problem_size_.S - 1 - filter_s_);
+    }
+
+    int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h;
+    int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w;
+
+    int c = filter_c_ + iteration_vector_ * AccessType::kElements;
+
+    return TensorCoord(n, h, w, c);
+  }
+
+  /// Returns true if the current coordinate is within the activations tensor X
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+
+    TensorCoord coord = at();
+
+    // h/w may be negative because of padding, hence the explicit >= 0 checks
+    return coord.n() < problem_size_.N &&
+      coord.h() >= 0 && coord.h() < problem_size_.H &&
+      coord.w() >= 0 && coord.w() < problem_size_.W &&
+      coord.c() < problem_size_.C;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    TensorCoord coord = at();
+    LongIndex offset = params_.layout(coord);
+
+    AccessType const *ptr = reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8);
+
+    return ptr;
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropActivationTileAccessIteratorAnalytic &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if ((problem_size.C / problem_size.groups) % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    // NOTE(review): the template arguments of 'platform::is_same' below were stripped in
+    // this vendored copy (the C % 32 / C % 64 checks suggest interleaved layout tests);
+    // restore them from the upstream header.
+    if (platform::is_same>::value) {
+      if (problem_size.C % 32) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    if (platform::is_same>::value) {
+      if (problem_size.C % 64) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h
new file mode 100644
index 0000000000000000000000000000000000000000..c608ce5305039ce42bd017fd74f14658a6c593da
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h
@@ -0,0 +1,360 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2.
Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile)
+    matrix from memory.
+
+    This iterator assumes TensorNHWC or TensorNCxHWx layout of tensors in Global Memory.
+
+    The iterator is specialized for each of the three convolution operators: forward propagation (Fprop),
+    backward data gradient (Dgrad), and backward weight gradient (Wgrad).
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/array.h"
+#include "cutlass/coord.h"
+#include "cutlass/matrix_shape.h"
+#include "cutlass/predicate_vector.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/tensor_view.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/matrix.h"
+#include "cutlass/conv/convolution.h"
+#include "cutlass/conv/conv2d_problem_size.h"
+#include "cutlass/conv/threadblock/conv2d_params.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace conv {
+namespace threadblock {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Tile access iterator over the fprop activation tensor, specialized for problems with
+// few channels: the contiguous GEMM dimension enumerates flattened (r, s, c) positions.
+// NOTE(review): the template-argument list of 'cutlass::AlignedArray' below appears to
+// have been stripped in this vendored copy; compare against the upstream CUTLASS header.
+template <
+  typename Shape_,
+  typename Element_,
+  typename Layout_,
+  typename ThreadMap_,
+  typename AccessType_ = cutlass::AlignedArray
+>
+class Conv2dFpropActivationTileAccessIteratorFewChannels {
+public:
+
+  //
+  // Types
+  //
+
+  using Shape = Shape_;
+  using Element = Element_;
+  using Layout = Layout_;
+  using TensorCoord = typename Layout::TensorCoord;
+  using ThreadMap = ThreadMap_;
+  using AccessType = AccessType_;
+  using TensorRef = cutlass::TensorRef;
+  using Index = typename Layout::Index;
+  using LongIndex = typename Layout::LongIndex;
+  static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kFewChannels;
+  static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided;
+  static int const kConvDim = 2;
+  using ConvProblemSize = typename conv::Conv2dProblemSize;
+
+  static int const kElementsPerAccess = ThreadMap::kElementsPerAccess;
+  static int const kPositionsPerTile = Shape::kColumn;
+
+  static int const kAccessesPerVector = kElementsPerAccess / AccessType::kElements;
+
+  static bool const kUseFastDivmodPrologue = true;
+  static bool const kUseFastDivmodMainloop = true;
+
+  // A value of 0 means "not specialized at compile time; read the value from
+  // problem_size_ instead" (see at() and can_implement())
+  static int const kStrideH = 0;
+  static int const kStrideW = 0;
+  static int const kDilationH = 0;
+  static int const kDilationW = 0;
+
+  static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements),
+    "Vectors implied by the thread map must be divisible by the access type.");
+
+  //
+  // Simplifying assertions
+  //
+  static_assert(ThreadMap::Iterations::kContiguous == 1,
+    "Require Iterations::kContiguous == 1");
+
+  //
+  // Parameters structure
+  //
+
+  using Params = Conv2dFewChannelsParams;
+
+private:
+
+  Params const &params_;
+  Conv2dProblemSize const &problem_size_;
+  LongIndex iteration_contiguous_;
+  LongIndex iteration_strided_;
+  LongIndex iteration_vector_;
+  char const *pointer_;
+
+  // flattened (r, s, c) position in the filter/channel space
+  int rsc_index_;
+  // per-strided-access decomposition of the GEMM row index into (n, p, q)
+  int offset_n_[ThreadMap::Iterations::kStrided];
+  int offset_p_[ThreadMap::Iterations::kStrided];
+  int offset_q_[ThreadMap::Iterations::kStrided];
+
+public:
+
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropActivationTileAccessIteratorFewChannels(
+    Params const &params,
+    Conv2dProblemSize const &problem_size,
+    Element const *ptr,
+    int thread_idx,
+    MatrixCoord const &threadblock_offset = MatrixCoord()       // tile index - units are threadblock-scoped tiles
+  ):
+    params_(params),
+    problem_size_(problem_size),
+    pointer_(reinterpret_cast(ptr)),
+    rsc_index_(0) {
+
+    layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx);
+
+    rsc_index_ = (threadblock_offset.column() + thread_coord.contiguous());
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) {
+      int offset_npq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided;
+
+      if (kUseFastDivmodPrologue) {
+        int residual = params_.divmod_Q.divmod(offset_q_[s], offset_npq);
+        offset_n_[s] = params_.divmod_P.divmod(offset_p_[s], residual);
+      }
+      else {
+        offset_n_[s] = offset_npq / (problem_size_.P * problem_size_.Q);
+        int residual = offset_npq % (problem_size_.P * problem_size_.Q);
+
+        offset_p_[s] = residual / problem_size_.Q;
+        offset_q_[s] = residual % problem_size_.Q;
+      }
+    }
+
+    set_iteration_index(0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) {
+    return Params(problem_size, layout);
+  }
+
+  /// Overrides the internal iteration index
+  CUTLASS_HOST_DEVICE
+  void set_iteration_index(Index index) {
+    iteration_vector_ = index % kAccessesPerVector;
+    int residual_access = index / kAccessesPerVector;
+    iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous;
+    iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous;
+  }
+
+  /// Adds a pointer offset in units of Element
+  CUTLASS_HOST_DEVICE
+  void add_pointer_offset(LongIndex pointer_offset) {
+    pointer_ += pointer_offset * sizeof_bits::value / 8;
+  }
+
+  // Advances the flattened (r, s, c) position by one threadblock tile
+  CUTLASS_HOST_DEVICE
+  void advance() {
+
+    rsc_index_ += kPositionsPerTile * problem_size_.split_k_slices;
+  }
+
+  /// Returns the coordinate in the activations tensor X that is currently pointed to
+  /// by the iterator.
+  CUTLASS_HOST_DEVICE
+  TensorCoord at() const {
+    int n = offset_n_[iteration_strided_];
+    int p = offset_p_[iteration_strided_];
+    int q = offset_q_[iteration_strided_];
+
+    int rsc_index = rsc_index_ + iteration_vector_ * AccessType::kElements;
+
+    // Decompose the flattened index into (r, s, c) with c fastest-varying
+    int r = 0;
+    int s = 0;
+    int c = 0;
+
+    if (kUseFastDivmodMainloop) {
+      int rs_index = params_.divmod_C.divmod(c, rsc_index);
+      r = params_.divmod_S.divmod(s, rs_index);
+    }
+    else {
+      c = (rsc_index % problem_size_.C);
+
+      int rs_index = (rsc_index / problem_size_.C);
+      s = (rs_index % problem_size_.S);
+      r = (rs_index / problem_size_.S);
+    }
+
+    // Cross-correlation vs. convolution: convolution flips the filter
+    if (problem_size_.mode == Mode::kConvolution) {
+      r = (problem_size_.R - 1 - r);
+      s = (problem_size_.S - 1 - s);
+    }
+
+    // kStrideH/W and kDilationH/W are 0 here, so the runtime problem-size values are used
+    int stride_h = kStrideH;
+    if (!kStrideH) {
+      stride_h = problem_size_.stride_h;
+    }
+
+    int stride_w = kStrideW;
+    if (!kStrideW) {
+      stride_w = problem_size_.stride_w;
+    }
+
+    int dilation_h = kDilationH;
+    if (!kDilationH) {
+      dilation_h = problem_size_.dilation_h;
+    }
+
+    int dilation_w = kDilationW;
+    if (!kDilationW) {
+      dilation_w = problem_size_.dilation_w;
+    }
+
+    int h = p * stride_h - problem_size_.pad_h + r * dilation_h;
+    int w = q * stride_w - problem_size_.pad_w + s * dilation_w;
+
+    return TensorCoord(n, h, w, c);
+  }
+
+  /// Returns true if the current coordinate is within the activations tensor X
+  CUTLASS_HOST_DEVICE
+  bool valid() const {
+
+    TensorCoord coord = at();
+
+    // h/w may be negative because of padding, hence the explicit >= 0 checks
+    bool in_bounds =
+      coord.n() < problem_size_.N &&
+      coord.h() >= 0 && coord.h() < problem_size_.H &&
+      coord.w() >= 0 && coord.w() < problem_size_.W &&
+      coord.c() < problem_size_.C;
+
+    return in_bounds;
+  }
+
+  /// Returns a pointer to the vector starting at the current coordinate
+  CUTLASS_HOST_DEVICE
+  AccessType const *get() const {
+
+    TensorCoord coord = at();
+
+    // NOTE(review): the offset accumulates in int32_t using precomputed NHWC strides;
+    // may overflow for very large activation tensors - verify against upstream.
+    int32_t offset =
+      coord.n() * params_.stride_n +
+      coord.h() * params_.stride_h +
+      coord.w() * params_.stride_w +
+      coord.c();
+
+    AccessType const *ptr = reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8);
+
+    return ptr;
+  }
+
+  /// Increments to the next memory access
+  CUTLASS_HOST_DEVICE
+  Conv2dFpropActivationTileAccessIteratorFewChannels &operator++() {
+    ++iteration_vector_;
+    if (iteration_vector_ < kAccessesPerVector) {
+      return *this;
+    }
+    iteration_vector_ = 0;
+
+    ++iteration_contiguous_;
+    if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) {
+      return *this;
+    }
+    iteration_contiguous_ = 0;
+
+    ++iteration_strided_;
+    if (iteration_strided_ < ThreadMap::Iterations::kStrided) {
+      return *this;
+    }
+    iteration_strided_ = 0;
+
+    return *this;
+  }
+
+  /// Determines whether the Implicit GEMM can execute the given problem.
+  CUTLASS_HOST_DEVICE
+  static Status can_implement(Conv2dProblemSize const &problem_size) {
+
+    // check alignment constraint on iterator's contiguous dimension
+    if (problem_size.C % AccessType::kElements) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    // A nonzero compile-time stride/dilation must match the runtime problem
+    if (kDilationH && problem_size.dilation_h != kDilationH) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (kDilationW && problem_size.dilation_w != kDilationW) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (kStrideH && problem_size.stride_h != kStrideH) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    if (kStrideW && problem_size.stride_w != kStrideW) {
+      return Status::kErrorInvalidProblem;
+    }
+
+    // NOTE(review): the template arguments of 'platform::is_same' below were stripped in
+    // this vendored copy (the C % 32 / C % 64 checks suggest interleaved layout tests);
+    // restore them from the upstream header.
+    if (platform::is_same>::value) {
+      if (problem_size.C % 32) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    if (platform::is_same>::value) {
+      if (problem_size.C % 64) {
+        return Status::kErrorInvalidProblem;
+      }
+    }
+
+    return Status::kSuccess;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace threadblock
+} // namespace conv
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed0e38c285c78ba570506074f40f6bc5cff45a76
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h
@@ -0,0 +1,353 @@
+/***************************************************************************************************
+ * Copyright (c) 2017 - 2025 NVIDIA
CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) + matrix from memory. + + This iterator assumes TensorNHWC or TensorNCxHWx layout of tensors in Global Memory. 
+ + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename Layout_, + typename ThreadMap_, + typename AccessType_ = cutlass::AlignedArray +> +class Conv2dFpropActivationTileAccessIteratorFixedChannels { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + using TensorRef = cutlass::TensorRef; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kFixedChannels; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static int const kFilterPositionsPerTile = Shape::kColumn / AccessType::kElements; + + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; + + static bool const 
kUseFastDivmodPrologue = true; + static bool const kUseFastDivmodMainloop = true; + + static int const kStrideH = 0; + static int const kStrideW = 0; + static int const kDilationH = 0; + static int const kDilationW = 0; + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + using Params = Conv2dFewChannelsParams; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + LongIndex iteration_vector_; + char const *pointer_; + + int rs_index_; + int offset_n_[ThreadMap::Iterations::kStrided]; + int offset_p_[ThreadMap::Iterations::kStrided]; + int offset_q_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dFpropActivationTileAccessIteratorFixedChannels( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // tile index - units are threadblock-scoped tiles + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + rs_index_(0) { + + // + // This requires problem_size.C == AccessType::kElements + // + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + rs_index_ = (threadblock_offset.column() + thread_coord.contiguous()) / AccessType::kElements; + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + int offset_npq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + if (kUseFastDivmodPrologue) { + int residual = params_.divmod_Q.divmod(offset_q_[s], offset_npq); + offset_n_[s] = params_.divmod_P.divmod(offset_p_[s], residual); + } + else { + 
offset_n_[s] = offset_npq / (problem_size_.P * problem_size_.Q); + int residual = offset_npq % (problem_size_.P * problem_size_.Q); + + offset_p_[s] = residual / problem_size_.Q; + offset_q_[s] = residual % problem_size_.Q; + } + } + + set_iteration_index(0); + } + + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_vector_ = index % kAccessesPerVector; + int residual_access = index / kAccessesPerVector; + iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous; + iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + rs_index_ += kFilterPositionsPerTile * problem_size_.split_k_slices; + } + + /// Returns the coordinate in the activations tensor X that is currently pointed to + /// by the iterator. 
+ CUTLASS_HOST_DEVICE + TensorCoord at() const { + int n = offset_n_[iteration_strided_]; + int p = offset_p_[iteration_strided_]; + int q = offset_q_[iteration_strided_]; + + int rs_index = rs_index_ + iteration_vector_; + + int r = 0; + int s = 0; + + if (kUseFastDivmodMainloop) { + r = params_.divmod_S.divmod(s, rs_index); + } + else { + s = (rs_index % problem_size_.S); + r = (rs_index / problem_size_.S); + } + + if (problem_size_.mode == Mode::kConvolution) { + r = (problem_size_.R - 1 - r); + s = (problem_size_.S - 1 - s); + } + + int stride_h = kStrideH; + if (!kStrideH) { + stride_h = problem_size_.stride_h; + } + + int stride_w = kStrideW; + if (!kStrideW) { + stride_w = problem_size_.stride_w; + } + + int dilation_h = kDilationH; + if (!kDilationH) { + dilation_h = problem_size_.dilation_h; + } + + int dilation_w = kDilationW; + if (!kDilationW) { + dilation_w = problem_size_.dilation_w; + } + + int h = p * stride_h - problem_size_.pad_h + r * dilation_h; + int w = q * stride_w - problem_size_.pad_w + s * dilation_w; + + return TensorCoord(n, h, w, 0); + } + + /// Returns true if the current coordinate is within the activations tensor X + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.h() >= 0 && coord.h() < problem_size_.H && + coord.w() >= 0 && coord.w() < problem_size_.W; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + + int32_t offset = + coord.n() * params_.stride_n + + coord.h() * params_.stride_h + + coord.w() * params_.stride_w + coord.c(); + + AccessType const *ptr = reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + + return ptr; + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dFpropActivationTileAccessIteratorFixedChannels &operator++() { + ++iteration_vector_; + if (iteration_vector_ < kAccessesPerVector) { + 
return *this; + } + iteration_vector_ = 0; + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C != AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + if (kDilationH && problem_size.dilation_h != kDilationH) { + return Status::kErrorInvalidProblem; + } + + if (kDilationW && problem_size.dilation_w != kDilationW) { + return Status::kErrorInvalidProblem; + } + + if (kStrideH && problem_size.stride_h != kStrideH) { + return Status::kErrorInvalidProblem; + } + + if (kStrideW && problem_size.stride_w != kStrideW) { + return Status::kErrorInvalidProblem; + } + + if (platform::is_same>::value) { + if (problem_size.C % 32) { + return Status::kErrorInvalidProblem; + } + } + + if (platform::is_same>::value) { + if (problem_size.C % 64) { + return Status::kErrorInvalidProblem; + } + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h new file mode 100644 index 
0000000000000000000000000000000000000000..1a5c33e885be7521981e2d4bc5fc35f3b1412ebe --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h @@ -0,0 +1,422 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) + matrix from memory. + + This iterator assumes TensorNHWC or TensorNCxHWx layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename Layout_, + typename ThreadMap_, + typename AccessType_ = cutlass::AlignedArray +> +class Conv2dFpropActivationTileAccessIteratorOptimized { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + using TensorRef = cutlass::TensorRef; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + using Mask = uint64_t; + + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require 
Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + using Params = Conv2dFpropActivationIteratorOptimizedParams; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + LongIndex iteration_vector_; + + // One pointer per access + char const *pointer_[ThreadMap::Iterations::kStrided]; + + // current filter position (r, s) + int filter_r_; + int filter_s_; + int filter_c_; + + Index masks_[ThreadMap::Iterations::kStrided][kAccessesPerVector][2]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dFpropActivationTileAccessIteratorOptimized( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // tile index - units are threadblock-scoped tiles + ): + params_(params), + problem_size_(problem_size), + filter_c_(0), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_c_ = threadblock_offset.column() + thread_coord.contiguous(); + + int offset_n[ThreadMap::Iterations::kStrided]; + int offset_p[ThreadMap::Iterations::kStrided]; + int offset_q[ThreadMap::Iterations::kStrided]; + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + pointer_[s] = reinterpret_cast(ptr); + + int offset_npq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // offset_n[s] = offset_npq / (problem_size_.P * problem_size_.Q); + // int residual = offset_npq % (problem_size_.P * problem_size_.Q); + // + // offset_p[s] = residual / problem_size_.Q; + // offset_q[s] = residual % problem_size_.Q; + // + + int residual; + + params.pq_divmod(offset_n[s], residual, offset_npq); + params.q_divmod(offset_p[s], offset_q[s], residual); + + TensorCoord 
coord = at_(offset_n[s], offset_p[s], offset_q[s], 0, 0); + + pointer_[s] += params_.layout(coord) * sizeof_bits::value / 8; + } + + clear_mask(); + + CUTLASS_PRAGMA_NO_UNROLL + for (int r = 0; r < problem_size_.R; ++r) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int r_ = r; + if (problem_size_.mode == Mode::kConvolution) { + r_ = problem_size_.R - 1 - r; + } + + int h = offset_p[s_idx] * problem_size_.stride_h - problem_size_.pad_h + r_ * problem_size_.dilation_h; + + bool pred = (offset_n[s_idx] < problem_size_.N && h >= 0 && h < problem_size_.H); + + CUTLASS_PRAGMA_UNROLL + for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) { + masks_[s_idx][v_idx][0] |= (pred << r); + } + } + } + + CUTLASS_PRAGMA_NO_UNROLL + for (int s = 0; s < problem_size_.S; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int s_ = s; + if (problem_size_.mode == Mode::kConvolution) { + s_ = problem_size_.S - 1 - s; + } + + int w = offset_q[s_idx] * problem_size_.stride_w - problem_size_.pad_w + s_ * problem_size_.dilation_w; + + bool pred = (w >= 0 && w < problem_size_.W); + + CUTLASS_PRAGMA_UNROLL + for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) { + masks_[s_idx][v_idx][1] |= (pred << s); + } + } + } + + CUTLASS_PRAGMA_UNROLL + for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) { + clear_mask(v_idx, filter_c_ + v_idx * AccessType::kElements >= problem_size_.C); + } + + set_iteration_index(0); + } + + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}); + } + +private: + + /// Returns the coordinate in the 
activations tensor X that is correspoinding to + // output npq and filter position r, s + CUTLASS_HOST_DEVICE + TensorCoord at_(int n, int p, int q, int r, int s) const { + + if (problem_size_.mode == Mode::kConvolution) { + r = problem_size_.R - 1 - r; + s = problem_size_.S - 1 - s; + } + + int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h; + int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w; + + return TensorCoord(n, h, w, filter_c_); + } + + /// Adds a pointer offset in units of element + CUTLASS_HOST_DEVICE + void add_byte_offset_(LongIndex byte_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + pointer_[s] += byte_offset; + } + } + +public: + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_vector_ = index % kAccessesPerVector; + int residual_access = index / kAccessesPerVector; + + iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous; + iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + add_byte_offset_(pointer_offset * sizeof_bits::value / 8); + } + + CUTLASS_HOST_DEVICE + void advance() { + + int next_idx = 0; + + // moves to the next tile + ++filter_s_; + if (filter_s_ == problem_size_.S) { + filter_s_ = 0; + ++filter_r_; + + if (filter_r_ < problem_size_.R) { + next_idx = 1; + } + else { + filter_r_ = 0; + next_idx = 2; + } + } + + add_byte_offset_(params_.inc_next[next_idx]); + + if (next_idx == 2) { + filter_c_ += params_.filter_c_delta; + } + + CUTLASS_PRAGMA_UNROLL + for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) { + clear_mask(v_idx, filter_c_ + v_idx * AccessType::kElements >= problem_size_.C); + } + } + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void 
clear_mask(bool clear = true) { + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kAccessesPerVector; ++v) { + masks_[s][v][0] = clear ? 0 : masks_[s][v][0]; + masks_[s][v][1] = clear ? 0 : masks_[s][v][1]; + } + } + } + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void clear_mask(int v, bool clear = true) { + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + masks_[s][v][0] = clear ? 0 : masks_[s][v][0]; + masks_[s][v][1] = clear ? 0 : masks_[s][v][1]; + } + } + + CUTLASS_HOST_DEVICE + bool valid() { + + return + (masks_[iteration_strided_][iteration_vector_][0] & (Index(1) << filter_r_)) && + (masks_[iteration_strided_][iteration_vector_][1] & (Index(1) << filter_s_)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + return reinterpret_cast(pointer_[iteration_strided_]) + iteration_vector_; + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dFpropActivationTileAccessIteratorOptimized &operator++() { + + ++iteration_vector_; + if (iteration_vector_ < kAccessesPerVector) { + return *this; + } + iteration_vector_ = 0; + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if ((problem_size.C / problem_size.groups) % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + if (platform::is_same>::value) { + if (problem_size.C % 32) { + return Status::kErrorInvalidProblem; + } + } + + if (platform::is_same>::value) { + if (problem_size.C % 64) { + return Status::kErrorInvalidProblem; + } + } + + // Conv2dFpropActivationTileAccessIteratorOptimized has constraint on filter positions + // due to the number of mask bits. + if (problem_size.R > 32 || problem_size.S > 32) { + return Status::kErrorNotSupported; + } + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h new file mode 100644 index 0000000000000000000000000000000000000000..ed200ed3cf030055b3f7ba470748c91c3751fbfe --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h @@ -0,0 +1,330 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNHWC or TensorCxRSKx layout of tensors in Global Memory. 
+ + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename Layout_, + typename ThreadMap_, + typename AccessType_ = cutlass::AlignedArray, + conv::GroupMode GroupMode_ = conv::GroupMode::kNone, + bool IsDeconv_ = false +> +class Conv2dFpropFilterTileAccessIteratorAnalytic { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static bool const IsDeconv = IsDeconv_; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + static conv::GroupMode const kGroupMode = GroupMode_; + + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / 
AccessType::kElements; + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + using Params = Conv2dAnalyticParams; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + LongIndex iteration_vector_; + char const *pointer_; + + int filter_r_; + int filter_s_; + int filter_c_; + int filter_c_init_; + int crs_cnt_; + int crs_per_group_; + int group_idx_offset_c_; + int channels_per_group_; + + int offset_k_[ThreadMap::Iterations::kStrided]; + int group_idx_offset_k_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dFpropFilterTileAccessIteratorAnalytic( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + crs_cnt_(0), + group_idx_offset_c_(0), + filter_r_(0), + filter_s_(0), + filter_c_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_c_ = threadblock_offset.row() + thread_coord.contiguous(); + + auto input_channels = (IsDeconv ? problem_size_.K : problem_size_.C); + auto output_channels = (IsDeconv ? 
problem_size_.C : problem_size_.K); + + if (kGroupMode != conv::GroupMode::kNone) { + filter_c_init_ = filter_c_; + if (kGroupMode == conv::GroupMode::kDepthwise){ + channels_per_group_ = 1; + crs_per_group_ = problem_size_.S * problem_size_.R; + } else { + channels_per_group_ = input_channels / problem_size_.groups; + crs_per_group_ = problem_size_.S * problem_size_.R * ((channels_per_group_ + Shape::kRow - 1) / Shape::kRow); + } + } + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_k_[s] = threadblock_offset.column() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + if (kGroupMode != conv::GroupMode::kNone && kGroupMode != conv::GroupMode::kDepthwise) { + group_idx_offset_k_[s] = (thread_coord.strided() + s * ThreadMap::Delta::kStrided) / (output_channels / problem_size_.groups); + } + } + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_vector_ = index % kAccessesPerVector; + int residual_access = index / kAccessesPerVector; + iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous; + iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * 8 / sizeof_bits::value; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next tile + if (kGroupMode != conv::GroupMode::kNone) { + ++crs_cnt_; + } + + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + + if (kGroupMode == conv::GroupMode::kNone) { + filter_c_ += Shape::kRow * problem_size_.split_k_slices; + } else { + if (crs_cnt_ == crs_per_group_) { + crs_cnt_ = 0; + filter_c_ = filter_c_init_; + if (kGroupMode != 
conv::GroupMode::kDepthwise) { + // moves to next group + ++group_idx_offset_c_; + } + } else { + filter_c_ += Shape::kRow * problem_size_.split_k_slices; + } + } + } + + /// Returns the coordinate in the filter tensor W that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int k = offset_k_[iteration_strided_]; + int c = filter_c_ + iteration_vector_ * AccessType::kElements; + + return TensorCoord(k, filter_r_, filter_s_, c); + } + + /// Returns true if the current coordinate is within the activations tensor W + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + auto input_channels = (IsDeconv ? problem_size_.K : problem_size_.C); + auto output_channels = (IsDeconv ? problem_size_.C : problem_size_.K); + + if (kGroupMode == conv::GroupMode::kNone) { + return coord.n() < output_channels && coord.c() < input_channels; + } else if (kGroupMode == conv::GroupMode::kDepthwise) { + return coord.n() < output_channels && coord.c() < 1; // channels_per_group_ is always equal to ONE. 
+ } else { + return coord.n() < output_channels && coord.c() < channels_per_group_ && + group_idx_offset_c_ == group_idx_offset_k_[iteration_strided_]; + } + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dFpropFilterTileAccessIteratorAnalytic &operator++() { + ++iteration_vector_; + if (iteration_vector_ < kAccessesPerVector) { + return *this; + } + iteration_vector_ = 0; + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + auto input_channels = (IsDeconv ? problem_size.K : problem_size.C); + auto output_channels = (IsDeconv ? 
problem_size.C : problem_size.K); + + // check alignment constraint on iterator's contiguous dimension + if ((input_channels / problem_size.groups) % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + if (platform::is_same>::value) { + if (output_channels % 32) { + return Status::kErrorInvalidProblem; + } + } + + if (platform::is_same>::value) { + if (output_channels % 64) { + return Status::kErrorInvalidProblem; + } + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h new file mode 100644 index 0000000000000000000000000000000000000000..f208c9a5bb2ee697626a8caebc5715073ecdc7eb --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h @@ -0,0 +1,289 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNHWC or TensorCxRSKx layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename Layout_, + typename ThreadMap_, + typename AccessType_ = cutlass::AlignedArray +> +class Conv2dFpropFilterTileAccessIteratorFewChannels { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kFewChannels; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static int const kElementsPerAccess = ThreadMap::kElementsPerAccess; + static int const kPositionsPerTile = Shape::kRow; + + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; + + static bool const kUseFastDivmodPrologue = true; + static bool const kUseFastDivmodMainloop = true; + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map 
must be divisible by the access type."); + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + using Params = Conv2dFewChannelsParams; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + LongIndex iteration_vector_; + char const *pointer_; + + int rsc_index_; + + int offset_k_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dFpropFilterTileAccessIteratorFewChannels( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + rsc_index_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + rsc_index_ = (threadblock_offset.row() + thread_coord.contiguous()); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_k_[s] = threadblock_offset.column() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + } + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_vector_ = index % kAccessesPerVector; + int residual_access = index / kAccessesPerVector; + iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous; + iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * 8 / sizeof_bits::value; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next tile + rsc_index_ += kPositionsPerTile * problem_size_.split_k_slices; + } + + /// Returns the coordinate 
in the filter tensor W that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int rsc_index = rsc_index_ + iteration_vector_ * AccessType::kElements; + + int c = 0; + int s = 0; + int r = 0; + + if (kUseFastDivmodMainloop) { + int rs_index = params_.divmod_C.divmod(c, rsc_index); + r = params_.divmod_S.divmod(s, rs_index); + } + else { + c = (rsc_index % problem_size_.C); + int rs_index = (rsc_index / problem_size_.C); + + s = (rs_index % problem_size_.S); + r = (rs_index / problem_size_.S); + } + + int k = offset_k_[iteration_strided_]; + + return TensorCoord(k, r, s, c); + } + + /// Returns true if the current coordinate is within the activations tensor W + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + bool in_bounds = + coord.n() < problem_size_.K && + coord.h() >= 0 && + coord.h() < problem_size_.R && + coord.c() < problem_size_.C; + + return in_bounds; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + + int32_t offset = + coord.n() * params_.stride_n + + coord.h() * params_.stride_h + + coord.w() * params_.stride_w + + coord.c(); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dFpropFilterTileAccessIteratorFewChannels &operator++() { + ++iteration_vector_; + if (iteration_vector_ < kAccessesPerVector) { + return *this; + } + iteration_vector_ = 0; + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + if (platform::is_same>::value) { + if (problem_size.K % 32) { + return Status::kErrorInvalidProblem; + } + } + + if (platform::is_same>::value) { + if (problem_size.K % 64) { + return Status::kErrorInvalidProblem; + } + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h new file mode 100644 index 0000000000000000000000000000000000000000..2dc2151d8ba2759d55f6602024a5072b31789cf6 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h @@ -0,0 +1,275 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNHWC or TensorCxRSKx layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename Layout_, + typename ThreadMap_, + typename AccessType_ = cutlass::AlignedArray +> +class Conv2dFpropFilterTileAccessIteratorFixedChannels { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kFixedChannels; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static int const kFilterPositionsPerTile = Shape::kRow / AccessType::kElements; + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; + + static bool const kUseFastDivmodPrologue = true; + static bool const kUseFastDivmodMainloop = true; + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); 
+ + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + using Params = Conv2dFewChannelsParams; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + LongIndex iteration_vector_; + char const *pointer_; + + int rs_index_; + + int offset_k_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dFpropFilterTileAccessIteratorFixedChannels( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + rs_index_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + rs_index_ = (threadblock_offset.row() + thread_coord.contiguous()) / AccessType::kElements; + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_k_[s] = threadblock_offset.column() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + } + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_vector_ = index % kAccessesPerVector; + int residual_access = index / kAccessesPerVector; + iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous; + iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * 8 / sizeof_bits::value; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next tile + rs_index_ += kFilterPositionsPerTile * problem_size_.split_k_slices; + } + + /// Returns the coordinate in the filter 
tensor W that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int rs_index = rs_index_ + iteration_vector_; + + int r = 0; + int s = 0; + + if (kUseFastDivmodMainloop) { + r = params_.divmod_S.divmod(s, rs_index); + } + else { + s = (rs_index % problem_size_.S); + r = (rs_index / problem_size_.S); + } + + int k = offset_k_[iteration_strided_]; + + return TensorCoord(k, r, s, 0); + } + + /// Returns true if the current coordinate is within the activations tensor W + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return coord.n() < problem_size_.K && coord.h() >= 0 && coord.h() < problem_size_.R; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + + int32_t offset = + coord.n() * params_.stride_n + + coord.h() * params_.stride_h + + coord.w() * params_.stride_w + coord.c(); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dFpropFilterTileAccessIteratorFixedChannels &operator++() { + ++iteration_vector_; + if (iteration_vector_ < kAccessesPerVector) { + return *this; + } + iteration_vector_ = 0; + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C != AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + if (platform::is_same>::value) { + if (problem_size.K % 32) { + return Status::kErrorInvalidProblem; + } + } + + if (platform::is_same>::value) { + if (problem_size.K % 64) { + return Status::kErrorInvalidProblem; + } + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h new file mode 100644 index 0000000000000000000000000000000000000000..9b12fbe3390c61f9f39ed54ad27cf78e65d80dff --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h @@ -0,0 +1,322 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNHWC or TensorCxRSKx layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename Layout_, + typename ThreadMap_, + typename AccessType_ = cutlass::AlignedArray, + bool IsDeconv_ = false +> +class Conv2dFpropFilterTileAccessIteratorOptimized{ +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static bool const IsDeconv = IsDeconv_; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require 
Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + struct Params : Conv2dFpropFilterIteratorOptimizedParams { + + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params(Conv2dFpropFilterIteratorOptimizedParams const &base): + Conv2dFpropFilterIteratorOptimizedParams(base) { } + + CUTLASS_HOST_DEVICE + Params( + Conv2dProblemSize const &problem_size, + Layout const &layout + ): + Conv2dFpropFilterIteratorOptimizedParams( + problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided} + ) { + + } + }; + +private: + + Conv2dFpropFilterIteratorOptimizedParams const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + LongIndex iteration_vector_; + char const *pointer_; + + uint32_t predicates_[kAccessesPerVector]; + int filter_rs_; + int filter_c_; + int channels_per_group_; + + // + // Assertions + // + + // We map predicates into bits packed in this uint32_t container + static_assert(ThreadMap::Iterations::kStrided < sizeof(predicates_) * 8, + "Currently, the number of loads per iteration is limited by the size of the predicates container."); + +public: + + CUTLASS_HOST_DEVICE + Conv2dFpropFilterTileAccessIteratorOptimized( + Conv2dFpropFilterIteratorOptimizedParams const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + predicates_{0}, + filter_rs_(0), + filter_c_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_c_ = threadblock_offset.row() + thread_coord.contiguous(); + Index column = threadblock_offset.column() + 
thread_coord.strided(); + channels_per_group_ = (IsDeconv ? problem_size_.K : problem_size_.C) / problem_size_.groups; + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + uint32_t pred = ((column + s * ThreadMap::Delta::kStrided < (IsDeconv ? problem_size_.C : problem_size_.K)) ? 1u : 0); + + CUTLASS_PRAGMA_UNROLL + for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) { + predicates_[v_idx] |= (pred << s); + } + } + + CUTLASS_PRAGMA_UNROLL + for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) { + clear_mask(v_idx, filter_c_ + v_idx * AccessType::kElements >= channels_per_group_); + } + + pointer_ += ( + params_.layout({filter_c_, column}) + ) * sizeof_bits::value / 8; + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_vector_ = index % kAccessesPerVector; + int residual_access = index / kAccessesPerVector; + iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous; + iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + LongIndex next = params_.inc_next_rs; + + // moves to the next tile + ++filter_rs_; + if (filter_rs_ == params_.RS) { + + filter_rs_ = 0; + next = params_.inc_next_c; + filter_c_ += params_.filter_c_delta; + } + + CUTLASS_PRAGMA_UNROLL + for (int v_idx = 0; v_idx < kAccessesPerVector; ++v_idx) { + clear_mask(v_idx, filter_c_ + v_idx * AccessType::kElements >= channels_per_group_); + } + + pointer_ += next; + } + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void clear_mask(int v, bool clear = true) { + predicates_[v] = clear ? 
0u : predicates_[v]; + } + + /// Returns true if the current coordinate is within the filter tensor W + CUTLASS_HOST_DEVICE + bool valid() { + return (predicates_[iteration_vector_] & (1u << iteration_strided_)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + return reinterpret_cast(pointer_) + iteration_vector_; + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dFpropFilterTileAccessIteratorOptimized &operator++() { + ++iteration_vector_; + if (iteration_vector_ < kAccessesPerVector) { + return *this; + } + iteration_vector_ = 0; + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + + // Move to the next K coordinate within the tile + pointer_ += params_.inc_next_k; + + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + auto input_channels = (IsDeconv ? problem_size.K : problem_size.C); + auto output_channels = (IsDeconv ? 
problem_size.C : problem_size.K); + + // check alignment constraint on iterator's contiguous dimension + if ((input_channels / problem_size.groups) % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + if (platform::is_same>::value) { + if (output_channels % 32) { + return Status::kErrorInvalidProblem; + } + } + + if (platform::is_same>::value) { + if (output_channels % 64) { + return Status::kErrorInvalidProblem; + } + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_params.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_params.h new file mode 100644 index 0000000000000000000000000000000000000000..8a3828fccb00b32d70e215785be3da1d317ed38a --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_params.h @@ -0,0 +1,893 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! + \file + \brief Extracts the host-params objects into non-template code. 
+*/ + +#pragma once + +#define TRACE_CONV_PARAMS_INITIALIZERS_ENABLED 0 + +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +#if TRACE_CONV_PARAMS_INITIALIZERS_ENABLED +#include +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Params structure used for all Conv2d analytic tile iterators +template< typename Layout_ = layout::TensorNHWC > +struct Conv2dAnalyticParams { + + using Layout = Layout_; + + Layout layout; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv2dAnalyticParams() { } + + CUTLASS_HOST_DEVICE + Conv2dAnalyticParams( + Conv2dProblemSize const &, // unused; placeholder to match other Params interfaces. + Layout const &layout + ): layout(layout) { + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Params structure used for all Conv2d analytic tile iterators +template< typename Layout_ = layout::TensorNHWC > +struct Conv2dFewChannelsParams { + + using Layout = Layout_; + + + int32_t stride_w; + int32_t stride_h; + int32_t stride_n; + + FastDivmod divmod_P; + FastDivmod divmod_Q; + FastDivmod divmod_S; + FastDivmod divmod_C; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv2dFewChannelsParams() { } + + CUTLASS_HOST_DEVICE + Conv2dFewChannelsParams( + Conv2dProblemSize const &problem_size, // unused; placeholder to match other Params interfaces. 
+ Layout const &layout + ): + stride_w(int32_t(layout.stride()[0])), + stride_h(int32_t(layout.stride()[1])), + stride_n(int32_t(layout.stride()[2])), + divmod_P(problem_size.P), + divmod_Q(problem_size.Q), + divmod_S(problem_size.S), + divmod_C(problem_size.C) + { + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters structure used for Conv2dDgradOutputGradientTileAccessIteratorAnalyticParams +struct Conv2dDgradOutputGradientTileAccessIteratorAnalyticParams { + + using Layout = layout::TensorNHWC; + + Layout layout; + int tiled_rows_per_filter; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientTileAccessIteratorAnalyticParams() { } + + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientTileAccessIteratorAnalyticParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, ///< layout object + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape + ): layout(layout) { + + int tile_m_per_filter = strided_dgrad_tile_m_per_filter(problem_size, threadblock_shape.row()); + + tiled_rows_per_filter = tile_m_per_filter * threadblock_shape.row(); + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#if TRACE_CONV_PARAMS_INITIALIZERS_ENABLED + +CUTLASS_HOST_DEVICE +void TraceIteratorParams( + char const *conv_operator, + char const *operand, + int element_size_bits, + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta +) { + +#if !defined(__CUDA_ARCH__) + + char const *fname = "conv_iterator_params.csv"; + + std::ifstream test(fname); + bool file_exists = test.is_open(); + + if (file_exists) { + test.close(); + } + + std::ofstream trace("conv_iterator_params.csv", std::ofstream::app); + + if (!file_exists) { + trace + << 
"Operator,Operand,ElementSize,CtaRows,CtaColumns,ThreadCount,AccessSize," + << "IterationsContiguous,IterationsStrided,DeltaContiguous,DeltaStrided\n"; + } + + trace << conv_operator << "," << operand << "," << element_size_bits << "," + << threadblock_shape.row() << "," << threadblock_shape.column() + << "," << thread_count << "," << access_size + << "," << threadmap_iterations.contiguous() << "," << threadmap_iterations.strided() + << "," << threadmap_delta.contiguous() << "," << threadmap_delta.strided() << "\n"; +#endif +} + +#define TRACE_CONV_INITIALIZERS(conv_op, operand, element_size, cta_shape, thread_count, access_size, iterations, delta) \ + TraceIteratorParams(conv_op, operand, element_size, cta_shape, thread_count, access_size, iterations, delta); + +#else + +#define TRACE_CONV_INITIALIZERS(conv_op, operand, element_size, cta_shape, thread_count, access_size, iterations, delta) {} + +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters structure used for Conv2dFpropActivationTileIteratorOptimized +template< typename Layout_ = layout::TensorNHWC > +struct Conv2dFpropActivationIteratorOptimizedParams; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters structure used for Conv2dFpropActivationTileIteratorOptimized +template<> +struct Conv2dFpropActivationIteratorOptimizedParams { + + using Layout = layout::TensorNHWC; + + Layout layout; + + int64_t inc_next[3]; // {next S, next R, next C} + int filter_c_delta; // number of logical elements to add to filter_c_ + int PQ; // product of P*Q + + FastDivmod pq_divmod; + FastDivmod q_divmod; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv2dFpropActivationIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dFpropActivationIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, ///< layout object + int element_size_bits, ///< 
size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), + PQ(problem_size.P * problem_size.Q), + pq_divmod(PQ), + q_divmod(problem_size.Q) { + + TRACE_CONV_INITIALIZERS("conv2d_fprop", "activation", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + int conv_sign = (problem_size.mode == Mode::kConvolution ? -1 : 1); + + // next S + inc_next[0] = conv_sign * ( + int64_t(layout.stride()[0]) * problem_size.dilation_w + ) * element_size_bits / 8; + + // next R + inc_next[1] = conv_sign * ( + int64_t(layout.stride()[1]) * problem_size.dilation_h + - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // next C + inc_next[2] = ( + threadblock_shape.column() * problem_size.split_k_slices + - conv_sign * int64_t(problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h + - conv_sign * int64_t(problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // logical offset added to internal channel counter - units are elements, not bytes + filter_c_delta = threadblock_shape.column() * problem_size.split_k_slices; + } + +#if ENABLE_CONV2D_PARAMS_PRINT + /// Prints internal state. 
+ CUTLASS_HOST_DEVICE + void print() { + auto stride = layout.stride(); + printf( + "Conv2dFpropActivationIteratorOptimizedParams:\n" + " layout(w: %d, h: %d, n: %d)\n" + " inc_next[%ld, %ld, %ld]\n" + " filter_c_delta(%d) - PQ(%d)\n" + " pq_divmod(divisor: %d, multiplier: %u, shift_right: %u)\n" + " q_divmod(divisor: %d, multiplier: %u, shift_right: %u)\n", + stride[0], stride[1], stride[2], + inc_next[0], inc_next[1], inc_next[2], + filter_c_delta, + PQ, + pq_divmod.divisor, + pq_divmod.multiplier, + pq_divmod.shift_right, + q_divmod.divisor, + q_divmod.multiplier, + q_divmod.shift_right + ); + } +#endif +}; + +/// Parameters structure used for Conv2dFpropActivationTileIteratorOptimized +template +struct Conv2dFpropActivationIteratorOptimizedParams> { + static int const kInterleaved = Interleaved_; + + using Layout = layout::TensorNCxHWx; + + Layout layout; + + int64_t inc_next[3]; // {next S, next R, next C} + int filter_c_delta; // number of logical elements to add to filter_c_ + int PQ; // product of P*Q + + FastDivmod pq_divmod; + FastDivmod q_divmod; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv2dFpropActivationIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dFpropActivationIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, ///< layout object + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), PQ(problem_size.P * problem_size.Q), pq_divmod(PQ), q_divmod(problem_size.Q) { + + TRACE_CONV_INITIALIZERS("conv2d_fprop", "activation", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + int conv_sign = (problem_size.mode == Mode::kConvolution ? 
-1 : 1); + + // next S + inc_next[0] = conv_sign * (kInterleaved * problem_size.dilation_w) * element_size_bits / 8; + + // next R + inc_next[1] = conv_sign * ( + int64_t(layout.stride()[0]) * problem_size.dilation_h + - (problem_size.S - 1) * kInterleaved * problem_size.dilation_w + ) * element_size_bits / 8; + + // next C + inc_next[2] = ( + threadblock_shape.column() * problem_size.split_k_slices / kInterleaved * int64_t(layout.stride()[1]) + - conv_sign * int64_t(problem_size.R - 1) * layout.stride()[0] * problem_size.dilation_h + - conv_sign * int64_t(problem_size.S - 1) * kInterleaved * problem_size.dilation_w + ) * element_size_bits / 8; + + // logical offset added to internal channel counter - units are elements, not bytes + filter_c_delta = threadblock_shape.column() * problem_size.split_k_slices; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template< typename Layout_ = layout::TensorNHWC > +struct Conv2dFpropFilterIteratorOptimizedParams; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +struct Conv2dFpropFilterIteratorOptimizedParams +{ + + using Layout = layout::TensorNHWC; + + Layout layout; + int RS; + int filter_c_delta; + + int64_t inc_next_k; // offset in units of bytes to next K position + int64_t inc_next_rs; // offset in units of bytes to next RS position + int64_t inc_next_c; // offset in units of bytes to next C position + + // + // Methods + // + CUTLASS_HOST_DEVICE + Conv2dFpropFilterIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dFpropFilterIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout) { + + 
TRACE_CONV_INITIALIZERS("conv2d_fprop", "filter", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + RS = problem_size.R * problem_size.S; + + inc_next_k = (int64_t(layout.stride()[2]) * threadmap_delta.strided() * element_size_bits) / 8; + + inc_next_rs = + ( int64_t(layout.stride()[0]) + - int64_t(layout.stride()[2]) * (threadmap_iterations.strided() - 1) * threadmap_delta.strided() + ) * element_size_bits / 8; + + inc_next_c = + ( + threadblock_shape.row() * problem_size.split_k_slices + - int64_t(RS - 1) * layout.stride()[0] + - int64_t(threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[2] + ) * element_size_bits / 8; + + filter_c_delta = threadblock_shape.row() * problem_size.split_k_slices; + } + +#if ENABLE_CONV2D_PARAMS_PRINT + /// Prints internal state. + CUTLASS_HOST_DEVICE + void print() { + auto stride = layout.stride(); + printf( + "Conv2dFpropFilterIteratorOptimizedParams:\n" + " layout[%d, %d, %d]\n" + " RS(%d), filter_c_delta(%d), inc_next(k: %ld, rs: %ld, c: %ld)\n", + stride[0], stride[1], stride[2], + RS, + filter_c_delta, + inc_next_k, inc_next_rs, inc_next_c + ); + } +#endif +}; + +template +struct Conv2dFpropFilterIteratorOptimizedParams> +{ + static int const kInterleaved = Interleaved_; + using Layout = layout::TensorCxRSKx; + + Layout layout; + int RS; + int filter_c_delta; + + int64_t inc_next_k; // offset in units of bytes to next K position + int64_t inc_next_rs; // offset in units of bytes to next RS position + int64_t inc_next_c; // offset in units of bytes to next C position + + // + // Methods + // + CUTLASS_HOST_DEVICE + Conv2dFpropFilterIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dFpropFilterIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + 
layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout) { + + TRACE_CONV_INITIALIZERS("conv2d_fprop", "filter", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + RS = problem_size.R * problem_size.S; + + inc_next_k = (kInterleaved * threadmap_delta.strided() * element_size_bits) / 8; + + inc_next_rs = + ( int64_t(layout.stride()[0]) + - kInterleaved * (threadmap_iterations.strided() - 1) * threadmap_delta.strided() + ) * element_size_bits / 8; + + inc_next_c = + ( + threadblock_shape.row() * problem_size.split_k_slices / kInterleaved * int64_t(layout.stride()[2]) + - int64_t(RS - 1) * layout.stride()[0] + - int64_t(threadmap_iterations.strided() - 1) * threadmap_delta.strided() * kInterleaved + ) * element_size_bits / 8; + + filter_c_delta = threadblock_shape.row() * problem_size.split_k_slices; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// Dgrad Optimized Dy params (layout::TensorNHWC) +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Parameters object for Conv2d DGRAD OutputGradient (dy) iterator +struct Conv2dDgradOutputGradientIteratorOptimizedParams { + + using Layout = layout::TensorNHWC; + + Layout layout; + + int64_t inc_next[3]; // {next S, next R, next K} + + int filter_k_delta; // number of logical elements to add to filter_k_ + + int HW; // product of H*W + + FastDivmod hw_divmod; + FastDivmod w_divmod; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dDgradOutputGradientIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord 
threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), + HW(problem_size.H *problem_size.W), + hw_divmod(HW), + w_divmod(problem_size.W) { + + TRACE_CONV_INITIALIZERS("conv2d_dgrad", "output_gradient", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + int conv_sign = (problem_size.mode == Mode::kConvolution ? 1 : -1); + + // next S + inc_next[0] = conv_sign * ( + (int64_t)layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // next R + inc_next[1] = conv_sign * ( + (int64_t)layout.stride()[1] * problem_size.dilation_h + - (problem_size.S - 1) * (int64_t)layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // next K + inc_next[2] = ( + threadblock_shape.column() * problem_size.split_k_slices + - conv_sign * (problem_size.R - 1) * (int64_t)layout.stride()[1] * problem_size.dilation_h + - conv_sign * (problem_size.S - 1) * (int64_t)layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // logical offset added to internal channel counter - units are elements, not bytes + filter_k_delta = threadblock_shape.column() * problem_size.split_k_slices; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// Strided Dgrad Optimized Dy params (layout::TensorNHWC) +///////////////////////////////////////////////////////////////////////////////////////////////// +struct Conv2dStridedDgradOutputGradientIteratorOptimizedParams { + + using Layout = layout::TensorNHWC; + + Layout layout; + + int64_t inc_next[3]; // {next S, next R, next K} + + int filter_k_delta; // number of logical elements to add to filter_k_ + + int tiled_rows_per_filter; + + int conv_sign; + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv2dStridedDgradOutputGradientIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dStridedDgradOutputGradientIteratorOptimizedParams( + 
Conv2dProblemSize const &problem_size, + Layout const &layout, ///< layout object + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape + ): layout(layout) { + + int tile_m_per_filter = strided_dgrad_tile_m_per_filter(problem_size, threadblock_shape.row()); + + tiled_rows_per_filter = tile_m_per_filter * threadblock_shape.row(); + + conv_sign = (problem_size.mode == Mode::kConvolution ? 1 : -1); + + // next S + inc_next[0] = conv_sign * ( + (int64_t)layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // next R + inc_next[1] = conv_sign * ( + (int64_t)layout.stride()[1] * problem_size.dilation_h + ) * element_size_bits / 8; + + // next K + inc_next[2] = ( + threadblock_shape.column() * problem_size.split_k_slices + ) * element_size_bits / 8; + + // logical offset added to internal channel counter - units are elements, not bytes + filter_k_delta = threadblock_shape.column() * problem_size.split_k_slices; + } +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////////////////////// +// Dgrad Optimized w params (layout::TensorNHWC) +///////////////////////////////////////////////////////////////////////////////////////////////// +struct Conv2dDgradFilterIteratorOptimizedParams { + + using Layout = layout::TensorNHWC; + + Layout layout; + int RS; + int filter_k_delta; + + int64_t inc_next_strided; // offset in units of bytes to next K coordinate within tile + int64_t inc_next_rs; // offset in units of bytes to next RS position + int64_t inc_next_k; // offset in units of bytes to next K position in subsequent tile + + // + // Methods + // + CUTLASS_HOST_DEVICE + Conv2dDgradFilterIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dDgradFilterIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each 
element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), RS(problem_size.R * problem_size.S) { + + TRACE_CONV_INITIALIZERS("conv2d_dgrad", "filter", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + inc_next_strided = ((int64_t)layout.stride()[2] * threadmap_delta.strided() * element_size_bits) / 8; + + inc_next_rs = + ( (int64_t)layout.stride()[0] + - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * (int64_t)layout.stride()[2] + ) * element_size_bits / 8; + + inc_next_k = + ( + threadblock_shape.row() * problem_size.split_k_slices * (int64_t)layout.stride()[2] + - (problem_size.R * problem_size.S - 1) * (int64_t)layout.stride()[0] + - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * (int64_t)layout.stride()[2] + ) * element_size_bits / 8; + + filter_k_delta = threadblock_shape.row() * problem_size.split_k_slices; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////////////////////// +// StridedDgrad Optimized w params (layout::TensorNHWC) +///////////////////////////////////////////////////////////////////////////////////////////////// +struct Conv2dStridedDgradFilterIteratorOptimizedParams { + + using Layout = layout::TensorNHWC; + + Layout layout; + int RS; + int filter_k_delta; + + int64_t inc_next_strided; // offset in units of bytes to next K coordinate within tile + int64_t inc_next[3]; // {next S, next R, next K} + int64_t reset_bytes; // offset in units of bytes to move back the pointer + // + // Methods + // + CUTLASS_HOST_DEVICE + Conv2dStridedDgradFilterIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dStridedDgradFilterIteratorOptimizedParams( + Conv2dProblemSize 
const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), RS(problem_size.R * problem_size.S) { + + TRACE_CONV_INITIALIZERS("conv2d_dgrad", "filter", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + inc_next_strided = (layout.stride()[2] * threadmap_delta.strided() * element_size_bits) / 8; + + // next S + inc_next[0] = + ( (int64_t)layout.stride()[0] * problem_size.stride_w + //- (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[2] + ) * element_size_bits / 8; + + // next R + inc_next[1] = + ( (int64_t)layout.stride()[1] * problem_size.stride_h + //- (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[2] + ) * element_size_bits / 8; + + // next K + inc_next[2] = + ( + threadblock_shape.row() * problem_size.split_k_slices * (int64_t)layout.stride()[2] + //- (problem_size.R * problem_size.S - 1) * layout.stride()[0] + //- (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[2] + ) * element_size_bits / 8; + + // offset in units of bytes to move the pointer in backward direction + reset_bytes = (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * (int64_t)layout.stride()[2] + * element_size_bits / 8; + + filter_k_delta = threadblock_shape.row() * problem_size.split_k_slices; + } +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters object for Conv2d WGRAD Output Gradient (dy) iterator +struct Conv2dWgradOutputGradientIteratorOptimizedParams { + + using Layout = layout::TensorNHWC; + + Layout layout; + + int NPQ; // precomputd product of N*P*Q for clearing predicates + + FastDivmod pq_divmod; + FastDivmod 
q_divmod; + + int64_t offset_next_strided; // offset in units of bytes to next npq coordinate within tile + int64_t offset_next_contiguous; // offset in units of bytes to next k coordinate within tile + int64_t inc_next_npq; // offset in units of bytes to next npq position in subsequent tile + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv2dWgradOutputGradientIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dWgradOutputGradientIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), + NPQ(problem_size.N * problem_size.P * problem_size.Q), + pq_divmod(problem_size.P * problem_size.Q), + q_divmod(problem_size.Q) { + + TRACE_CONV_INITIALIZERS("conv2d_wgrad", "output_gradient", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + // Incremental offsets in unites of bytes (number of elements) * sizeof_bits::value / 8 + offset_next_strided = (threadmap_delta.strided() * (int64_t)layout.stride()[0]) + * element_size_bits / 8; + + offset_next_contiguous = (threadmap_delta.contiguous()) + * element_size_bits / 8; + + inc_next_npq = (threadblock_shape.column() * problem_size.split_k_slices * (int64_t)layout.stride()[0]) + * element_size_bits / 8; + } +}; + +struct Conv2dWgradActivationIteratorOptimizedParams { + + using Layout = layout::TensorNHWC; + + Layout layout; + + FastDivmod sc_divmod; + FastDivmod pq_divmod; + FastDivmod q_divmod; + FastDivmod c_divmod; + FastDivmod s_divmod; + int small_channel_conv_s_offset; + + // + // Methods + // + CUTLASS_HOST_DEVICE + Conv2dWgradActivationIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv2dWgradActivationIteratorOptimizedParams( + Conv2dProblemSize const 
&problem_size, + Layout const &layout + ): + layout(layout), + sc_divmod(problem_size.S * problem_size.C), + pq_divmod(problem_size.P * problem_size.Q), + q_divmod(problem_size.Q), + c_divmod(problem_size.C), + s_divmod(problem_size.S * problem_size.dilation_w), + small_channel_conv_s_offset((problem_size.S - 1) * problem_size.dilation_w - problem_size.pad_w) { + } + + CUTLASS_HOST_DEVICE + Conv2dWgradActivationIteratorOptimizedParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + Conv2dWgradActivationIteratorOptimizedParams( + problem_size, + layout + ) { + + TRACE_CONV_INITIALIZERS("conv2d_wgrad", "activation", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + } +}; + +struct PredicatedScaleBiasVectorAccessIteratorParams { + public: + /// Default ctor + CUTLASS_HOST_DEVICE + PredicatedScaleBiasVectorAccessIteratorParams() { } + + // Default ctor + CUTLASS_HOST_DEVICE + PredicatedScaleBiasVectorAccessIteratorParams( + Conv2dProblemSize const &problem_size, + layout::PitchLinear const &layout) {} + + // Default ctor + CUTLASS_HOST_DEVICE + PredicatedScaleBiasVectorAccessIteratorParams( + Conv2dProblemSize const &problem_size, + layout::RowMajor const &layout) {} +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_tile_iterator.h 
b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_tile_iterator.h new file mode 100644 index 0000000000000000000000000000000000000000..13bd29b7a0547eee204d642a3cd67a24709f89ea --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_tile_iterator.h @@ -0,0 +1,337 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template wraps the tile access iterator concept to load whole tiles from tensors in + memory used for implicit GEMM convolution. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +class TileIterator { +public: + using TileAccessIterator = TileAccessIterator_; + + using Shape = typename TileAccessIterator::Shape; + using Element = typename TileAccessIterator::Element; + using Layout = typename TileAccessIterator::Layout; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = typename TileAccessIterator::ThreadMap; + using AccessType = typename TileAccessIterator::AccessType; + using TensorRef = typename TileAccessIterator::TensorRef; + using Index = typename 
TileAccessIterator::Index; + using LongIndex = typename TileAccessIterator::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = TileAccessIterator::kIteratorAlgorithm; + static StrideSupport const kStrideSupport = TileAccessIterator::kStrideSupport; + using Params = typename TileAccessIterator::Params; + static int const kConvDim = TileAccessIterator::kConvDim; + using ConvProblemSize = typename TileAccessIterator::ConvProblemSize; + static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector; + + /// Fragment object to be loaded or stored + using Fragment = cutlass::Array< + Element, + ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>; + +private: + + /// Internal state + TileAccessIterator tile_access_iterator_; + +public: + + /// Constructor + CUTLASS_HOST_DEVICE + TileIterator( + Params const ¶ms, + ConvProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + tile_access_iterator_(params, problem_size, ptr, thread_idx, threadblock_offset) { } + + CUTLASS_HOST_DEVICE + static Params getParams(ConvProblemSize const &problem_size, Layout const &layout) { + return TileAccessIterator::getParams(problem_size, layout); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + tile_access_iterator_.set_iteration_index(index); + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + tile_access_iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + TileIterator &operator++() { + tile_access_iterator_.advance(); + return *this; + } + + /// Advances to the next tile in memory. 
+ CUTLASS_HOST_DEVICE + TileIterator operator++(int) { + TileIterator self(*this); + operator++(); + return self; + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) { + + frag.clear(); + AccessType *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kAccessesPerVector; ++v) { + + int idx = v + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous); + + cutlass::arch::global_load< + AccessType, + sizeof(AccessType) + >( + frag_ptr[idx], + tile_access_iterator_.get() + pointer_offset, + tile_access_iterator_.valid() + ); + + ++tile_access_iterator_; + } + } + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment &frag) { + tile_access_iterator_.set_iteration_index(0); + load_with_pointer_offset(frag, 0); + } + + CUTLASS_DEVICE + void advance() { + tile_access_iterator_.advance(); + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(ConvProblemSize const &problem_size) { + + // dispatch to iterator implementation + return TileAccessIterator::can_implement(problem_size); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// Strided Dgrad Tile Iterator +template +class TileIteratorStridedDgrad { +public: + using TileAccessIterator = TileAccessIterator_; + + using Shape = typename TileAccessIterator::Shape; + using Element = typename TileAccessIterator::Element; + using Layout = typename TileAccessIterator::Layout; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = typename TileAccessIterator::ThreadMap; + using AccessType = typename TileAccessIterator::AccessType; + using TensorRef = typename TileAccessIterator::TensorRef; + using Index = typename TileAccessIterator::Index; + using LongIndex = typename TileAccessIterator::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = TileAccessIterator::kIteratorAlgorithm; + static StrideSupport const kStrideSupport = TileAccessIterator::kStrideSupport; + using Params = typename TileAccessIterator::Params; + static int const kConvDim = TileAccessIterator::kConvDim; + using ConvProblemSize = typename TileAccessIterator::ConvProblemSize; + + /// Fragment object to be loaded or stored + using Fragment = cutlass::Array< + Element, + ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>; + +private: + + /// Internal state + TileAccessIterator tile_access_iterator_; + +public: + + /// Constructor (output gradient (Dy) OperandA ctor) + CUTLASS_HOST_DEVICE + TileIteratorStridedDgrad( + Params const ¶ms, + ConvProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + FastDivmod const &stride_h_divmod, FastDivmod const &stride_w_divmod, + int start_r, int start_s, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + tile_access_iterator_( + params, + problem_size, + ptr, + thread_idx, + 
stride_h_divmod, stride_w_divmod, + start_r, start_s, + threadblock_offset) { } + + /// Constructor (filter (w) OperandB ctor) + CUTLASS_HOST_DEVICE + TileIteratorStridedDgrad( + Params const ¶ms, + ConvProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + int start_r, int start_s, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + tile_access_iterator_(params, + problem_size, + ptr, + thread_idx, + start_r, start_s, + threadblock_offset) { } + + CUTLASS_HOST_DEVICE + static Params getParams(ConvProblemSize const &problem_size, Layout const &layout) { + return TileAccessIterator::getParams(problem_size, layout); + } + + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + tile_access_iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + TileIteratorStridedDgrad &operator++() { + tile_access_iterator_.advance(); + return *this; + } + + /// Advances to the next tile in memory. 
+ CUTLASS_HOST_DEVICE + TileIteratorStridedDgrad operator++(int) { + TileIteratorStridedDgrad self(*this); + operator++(); + return self; + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) { + + frag.clear(); + AccessType *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + cutlass::arch::global_load< + AccessType, + sizeof(AccessType) + >( + frag_ptr[c + s * ThreadMap::Iterations::kContiguous], + tile_access_iterator_.get() + pointer_offset, + tile_access_iterator_.valid() + ); + + ++tile_access_iterator_; + } + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment &frag) { + tile_access_iterator_.set_iteration_index(0); + load_with_pointer_offset(frag, 0); + } + + CUTLASS_DEVICE + void advance() { + tile_access_iterator_.advance(); + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(ConvProblemSize const &problem_size) { + + // dispatch to iterator implementation + return TileAccessIterator::can_implement(problem_size); + } +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h new file mode 100644 index 0000000000000000000000000000000000000000..b5a240773b5912c9ace50916f55e8a1054092845 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h @@ -0,0 +1,285 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (activation tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + typename AccessType_ = cutlass::AlignedArray +> +class Conv2dWgradActivationTileAccessIteratorAnalytic { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements of size 8b or greater."); + + // + // Parameters structure + // + + using Params = Conv2dAnalyticParams; + +private: + + Params 
const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + LongIndex iteration_vector_; + char const *pointer_; + + // Filter postion (r,s,c) in contiguous dimension stays constant for each gemm_iteration_k + int filter_r_[ThreadMap::Iterations::kContiguous]; + int filter_s_[ThreadMap::Iterations::kContiguous]; + int filter_c_[ThreadMap::Iterations::kContiguous]; + + int offset_npq_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dWgradActivationTileAccessIteratorAnalytic( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)) + { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + // initialize r,s,c filter position for every contiguous iteration + CUTLASS_PRAGMA_UNROLL + for(int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int rsc_offset = threadblock_offset.column() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + + filter_r_[c] = rsc_offset / (problem_size_.S * problem_size_.C); + int residual = rsc_offset % (problem_size_.S * problem_size_.C); + + filter_s_[c] = residual / problem_size_.C; + filter_c_[c] = residual % problem_size_.C; + } + + // initialize n, p, q offset for every strided iteration + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + offset_npq_[s] = threadblock_offset.row() + thread_coord.strided() + + s * ThreadMap::Delta::kStrided; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_vector_ = index % kAccessesPerVector; + int residual_access = index / kAccessesPerVector; + iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous; + iteration_strided_ = 
residual_access / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + // moves to the next GEMM-K offset (offset_npq_) in GEMM-B by a CTA-K tile + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_npq_[s] += Shape::kRow * problem_size_.split_k_slices; + } + } + + /// Returns the coordinate in the activation tensor x that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + int r, s, c; + + if (kAccessesPerVector == 1) { + /// One 128b aligned access fetching more than one element + c = filter_c_[iteration_contiguous_]; + r = filter_r_[iteration_contiguous_]; + s = filter_s_[iteration_contiguous_]; + } + else { + /// Multiple access to support non-128b alignment in contiguous dimension + c = (filter_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements) % problem_size_.C; + int wrap_c = (filter_c_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements) / problem_size_.C; + s = (filter_s_[iteration_contiguous_] + wrap_c) % problem_size_.S; + int wrap_s = (filter_s_[iteration_contiguous_] + wrap_c) / problem_size_.S; + r = filter_r_[iteration_contiguous_] + wrap_s; + } + + if (problem_size_.mode == Mode::kConvolution) { + r = (problem_size_.R - 1 - r); + s = (problem_size_.S - 1 - s); + } + + int n = offset_npq_[iteration_strided_] / (problem_size_.P * problem_size_.Q); + int residual = offset_npq_[iteration_strided_] % (problem_size_.P * problem_size_.Q); + + int p = residual / problem_size_.Q; + int q = residual % problem_size_.Q; + + int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h; + int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w; + + return TensorCoord(n, 
h, w, c); + } + + /// Returns true if the current coordinate is within the activation tensor x + CUTLASS_HOST_DEVICE + bool valid() const { + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.h() >= 0 && coord.h() < problem_size_.H && + coord.w() >= 0 && coord.w() < problem_size_.W; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dWgradActivationTileAccessIteratorAnalytic &operator++() { + ++iteration_vector_; + if (iteration_vector_ < kAccessesPerVector) { + return *this; + } + iteration_vector_ = 0; + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h new file mode 100644 index 0000000000000000000000000000000000000000..56197279a5a45be6ac992ac16a86011cd9843646 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h @@ -0,0 +1,321 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (activation tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + typename AccessType_ = cutlass::AlignedArray +> +class Conv2dWgradActivationTileAccessIteratorOptimized { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements of size 8b or greater."); + + // + // Parameters structure + // + + using Params = Conv2dWgradActivationIteratorOptimizedParams; + +private: + + 
Conv2dWgradActivationIteratorOptimizedParams const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + LongIndex iteration_vector_; + char const *pointer_; + + // Precomputed effective filter postion (r,s) in contiguous dimension stays constant for each gemm_iteration_k + // required for npq -> nhw translation + int precomputed_filter_r_[ThreadMap::Iterations::kContiguous]; + int precomputed_filter_s_[ThreadMap::Iterations::kContiguous]; + + // Channel dimension in contiguous dimension stays constant for each gemm_iteration_k + int filter_c_[ThreadMap::Iterations::kContiguous]; + + int offset_npq_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dWgradActivationTileAccessIteratorOptimized( + Conv2dWgradActivationIteratorOptimizedParams const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)) + { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + // initialize r,s,c filter position for every contiguous iteration + CUTLASS_PRAGMA_UNROLL + for(int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int rsc_offset = threadblock_offset.column() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // filter_r_[c] = rsc_offset / (problem_size_.S * problem_size_.C); + // int residual = rsc_offset % (problem_size_.S * problem_size_.C); + // + // filter_s_[c] = residual / problem_size_.C; + // filter_c_[c] = residual % problem_size_.C; + + int residual; + params_.sc_divmod(precomputed_filter_r_[c], residual, rsc_offset); + params_.c_divmod(precomputed_filter_s_[c], filter_c_[c], residual); + + int r = precomputed_filter_r_[c]; + int s = 
precomputed_filter_s_[c]; + + if (problem_size_.mode == Mode::kConvolution) { + r = (problem_size_.R - 1 - r); + s = (problem_size_.S - 1 - s); + } + + precomputed_filter_r_[c] = -problem_size_.pad_h + r * problem_size_.dilation_h; + precomputed_filter_s_[c] = -problem_size_.pad_w + s * problem_size_.dilation_w; + } + + // initialize n, p, q offset for every strided iteration + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + offset_npq_[s] = threadblock_offset.row() + thread_coord.strided() + + s * ThreadMap::Delta::kStrided; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_vector_ = index % kAccessesPerVector; + int residual_access = index / kAccessesPerVector; + iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous; + iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + // moves to the next GEMM-K offset (offset_npq_) in GEMM-B by a CTA-K tile + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_npq_[s] += Shape::kRow * problem_size_.split_k_slices; + } + } + + /// Returns the coordinate in the activation tensor x that is currently pointed to + /// by the iterator. 
+ CUTLASS_HOST_DEVICE + TensorCoord at() const { + int r = precomputed_filter_r_[iteration_contiguous_]; + int s = precomputed_filter_s_[iteration_contiguous_]; + int c = filter_c_[iteration_contiguous_]; + + if (kAccessesPerVector > 1) { + // This code section is only to support non-128b alignment + // Multiple access to support non-128b alignment in contiguous dimension + int wrap_c; + params_.c_divmod(wrap_c, c, c + iteration_vector_ * AccessType::kElements); + + if (problem_size_.mode == Mode::kConvolution) { + s -= (problem_size_.dilation_w * wrap_c); + + int wrap_s; + params_.s_divmod(wrap_s, s, params_.small_channel_conv_s_offset - s); + s = params_.small_channel_conv_s_offset - s; + + r -= (problem_size_.dilation_h * wrap_s); + + } else { + s += (problem_size_.dilation_w * wrap_c); + + int wrap_s; + params_.s_divmod(wrap_s, s, s + problem_size_.pad_w); + s -= problem_size_.pad_w; + + r += (problem_size_.dilation_h * wrap_s); + } + } + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // int n = offset_npq_[iteration_strided_] / (problem_size_.P * problem_size_.Q); + // int residual = offset_npq_[iteration_strided_] % (problem_size_.P * problem_size_.Q); + // + // int p = residual / problem_size_.Q; + // int q = residual % problem_size_.Q; + + int residual, n, p, q; + + params_.pq_divmod(n, residual, offset_npq_[iteration_strided_]); + params_.q_divmod(p, q, residual); + + int h = p * problem_size_.stride_h + r; + int w = q * problem_size_.stride_w + s; + + return TensorCoord(n, h, w, c); + } + + /// Returns true if the current coordinate is within the activation tensor x + CUTLASS_HOST_DEVICE + bool valid() const { + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.h() >= 0 && coord.h() < problem_size_.H && + coord.w() >= 0 && coord.w() < problem_size_.W; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const 
*get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dWgradActivationTileAccessIteratorOptimized &operator++() { + ++iteration_vector_; + if (iteration_vector_ < kAccessesPerVector) { + return *this; + } + iteration_vector_ = 0; + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h new file mode 100644 index 0000000000000000000000000000000000000000..ea48bc6de0f94015b24632b6609ee8c81dac93cb --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h @@ -0,0 
+1,260 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + typename AccessType_ = cutlass::AlignedArray +> +class Conv2dWgradOutputGradientTileAccessIteratorAnalytic { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static int const kAccessesPerVector = 
ThreadMap::kElementsPerAccess / AccessType::kElements; + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements of size 8b or greater."); + + // + // Parameters structure + // + + using Params = Conv2dAnalyticParams; + +private: + + Params const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + LongIndex iteration_vector_; + char const *pointer_; + + int filter_k_[ThreadMap::Iterations::kContiguous]; + + int offset_npq_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv2dWgradOutputGradientTileAccessIteratorAnalytic( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + // initialize filter_k for every contiguous iteration + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + filter_k_[c] = threadblock_offset.row() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + } + + // initialize n, p, q offset for every strided iteration + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_npq_[s] = threadblock_offset.column() + thread_coord.strided() + + s * ThreadMap::Delta::kStrided; + + } + } + + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_vector_ = index % kAccessesPerVector; + int residual_access = index / 
kAccessesPerVector; + iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous; + iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next GEMM-K offset (offset_npq_) in GEMM-A by a CTA-K tile + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_npq_[s] += Shape::kColumn * problem_size_.split_k_slices; + } + } + + /// Returns the coordinate in the output gradient tensor Dy that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int npq = offset_npq_[iteration_strided_]; + + int n = npq / (problem_size_.P * problem_size_.Q); + int residual = npq % (problem_size_.P * problem_size_.Q); + + int p = residual / problem_size_.Q; + int q = residual % problem_size_.Q; + + int k = filter_k_[iteration_contiguous_] + iteration_vector_ * AccessType::kElements; + + return TensorCoord(n, p, q, k); + } + + + /// Returns true if the current coordinate is within the output gradient tensor Dy + CUTLASS_HOST_DEVICE + bool valid() const { + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.h() < problem_size_.P && + coord.w() < problem_size_.Q && + coord.c() < problem_size_.K; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dWgradOutputGradientTileAccessIteratorAnalytic &operator++() { + ++iteration_vector_; + if (iteration_vector_ < kAccessesPerVector) { + return *this; + } + 
iteration_vector_ = 0; + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h new file mode 100644 index 0000000000000000000000000000000000000000..8e5048fd304f8edd96cca25bc8735725d6f2e843 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h @@ -0,0 +1,310 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. 
+ + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + typename AccessType_ = cutlass::AlignedArray +> +class Conv2dWgradOutputGradientTileAccessIteratorOptimized { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNHWC; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements 
of size 8b or greater."); + + // + // Parameters structure + // + + using Params = Conv2dWgradOutputGradientIteratorOptimizedParams; + +private: + + Conv2dWgradOutputGradientIteratorOptimizedParams const ¶ms_; + Conv2dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + LongIndex iteration_vector_; + char const *pointer_; + + uint32_t predicates_[kAccessesPerVector]; + int filter_k_; + int offset_npq_; + +public: + + CUTLASS_HOST_DEVICE + Conv2dWgradOutputGradientTileAccessIteratorOptimized( + Conv2dWgradOutputGradientIteratorOptimizedParams const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + predicates_{0}, + filter_k_(0), + offset_npq_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.row() + thread_coord.contiguous(); + offset_npq_ = threadblock_offset.column() + thread_coord.strided(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int filter_k = filter_k_ + c * ThreadMap::Delta::kContiguous; + int offset_npq = offset_npq_ + s * ThreadMap::Delta::kStrided; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kAccessesPerVector; ++v) { + bool predicate = valid_(at_(offset_npq, filter_k + v * AccessType::kElements)); + + uint32_t pred = (predicate ? 
1u : 0); + + int pred_idx = c + s * ThreadMap::Iterations::kContiguous; + + predicates_[v] |= (pred << pred_idx); + } + } + } + + // Offset pointer to (iteration_strided_, iteration_contiguous_) = (0, 0) + pointer_ += ( + offset_npq_ * params.layout.stride()[0] + filter_k_ + ) * sizeof_bits::value / 8; + + set_iteration_index(0); + } + + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_vector_ = index % kAccessesPerVector; + int residual_access = index / kAccessesPerVector; + iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous; + iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next GEMM-K offset (offset_npq_) in GEMM-A by a CTA-K tile + offset_npq_ += Shape::kColumn * problem_size_.split_k_slices; + + // Clear predicates if needed + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + if (offset_npq_ + s * ThreadMap::Delta::kStrided >= params_.NPQ) { + uint32_t kClearMask = ((1u << ThreadMap::Iterations::kContiguous) - 1) << (s * ThreadMap::Iterations::kContiguous); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kAccessesPerVector; ++v) { + predicates_[v] = (predicates_[v] & (~kClearMask)); + } + } + } + + pointer_ += params_.inc_next_npq; + } + +private: + /// Returns 
the coordinate in the output gradient tensor Dy that is pointed to + /// by offset_npq and k. + CUTLASS_HOST_DEVICE + TensorCoord at_(int offset_npq, int k) const { + + // The subsequent fast_divmod() operations are equivalent to the following logical computation: + // + // + // int npq = offset_npq; + // int n = npq / (problem_size_.P * problem_size_.Q); + // int residual = npq % (problem_size_.P * problem_size_.Q); + // + // int p = residual / problem_size_.Q; + // int q = residual % problem_size_.Q; + + int residual, n, p, q; + + params_.pq_divmod(n, residual, offset_npq); + params_.q_divmod(p, q, residual); + + return TensorCoord(n, p, q, k); + } + + /// Returns true if the coord is within the output gradient tensor Dy + CUTLASS_HOST_DEVICE + bool valid_(TensorCoord coord) const { + + return coord.n() < problem_size_.N && + coord.c() < problem_size_.K; + } + +public: + + /// Returns true if the current coordinate is within the output gradient tensor Dy + CUTLASS_HOST_DEVICE + bool valid() const { + + LongIndex pred_idx = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous; + return (predicates_[iteration_vector_] & (1u << pred_idx)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + return reinterpret_cast( + pointer_ + + iteration_strided_ * params_.offset_next_strided + + iteration_contiguous_ * params_.offset_next_contiguous + ) + iteration_vector_; + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv2dWgradOutputGradientTileAccessIteratorOptimized &operator++() { + ++iteration_vector_; + if (iteration_vector_ < kAccessesPerVector) { + return *this; + } + iteration_vector_ = 0; + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return 
*this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h new file mode 100644 index 0000000000000000000000000000000000000000..d996003f42587ef6de2af268e1903808991b34d9 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h @@ -0,0 +1,268 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv3dDgradFilterTileAccessIteratorAnalytic { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + static int const kAccessesPerVector = 1; + + static_assert(sizeof_bits::value >= 8, + "DGRAD requires elements of size 8b or larger."); + + // + // Parameters structure + // + + struct Params { + + Layout layout; + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params( + Conv3dProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + } + }; + +private: + + Params const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + 
char const *pointer_; + + // For a fixed filter position (t,r,s) find and fill offset_k_, offset_c_ in strided and contiguous dimension + int filter_t_; + int filter_r_; + int filter_s_; + int offset_k_[ThreadMap::Iterations::kStrided]; + int offset_c_[ThreadMap::Iterations::kContiguous]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dDgradFilterTileAccessIteratorAnalytic( + Params const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_t_(0), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + offset_c_[c] = threadblock_offset.column() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + } + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_k_[s] = + threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + ++filter_t_; + if (filter_t_ < problem_size_.T) { + return; + } + filter_t_ = 0; + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) 
{ + offset_k_[s] += Shape::kRow * problem_size_.split_k_slices; + } + } + + /// Returns the coordinate in the filter tensor w that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int c = offset_c_[iteration_contiguous_]; + int k = offset_k_[iteration_strided_]; + + return TensorCoord(k, filter_t_, filter_r_, filter_s_, c); + } + + /// Returns true if the current coordinate is within the filter tensor w + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return coord.n() < problem_size_.K && coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dDgradFilterTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h new file mode 100644 index 0000000000000000000000000000000000000000..a269b18b0010329dedd31c4689bff8db4fb46d2a --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h @@ -0,0 +1,289 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +#include "cutlass/conv/threadblock/conv3d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + conv::StrideSupport StrideSupport_ = conv::StrideSupport::kUnity +> +class Conv3dDgradFilterTileAccessIteratorOptimized { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = StrideSupport_; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + static int const kAccessesPerVector = 1; + + // + // Parameters structure + // + + struct Params : Conv3dDgradFilterIteratorOptimizedParams { + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params(Conv3dDgradFilterIteratorOptimizedParams const &base): + Conv3dDgradFilterIteratorOptimizedParams(base) { } + + CUTLASS_HOST_DEVICE + Params( + Conv3dProblemSize const &problem_size, + Layout const &layout 
+ ): + Conv3dDgradFilterIteratorOptimizedParams( + problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided} + ) { } + + }; + +private: + + Conv3dDgradFilterIteratorOptimizedParams const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + uint32_t predicates_; + int filter_trs_; + int filter_k_; + + // + // Assertions + // + + // We map predicates into bits packed in this uint32_t container + static_assert(ThreadMap::Iterations::kStrided * + ThreadMap::Iterations::kContiguous < sizeof(predicates_) * 8, + "Currently, the number of loads per iteration is limited by the size of the predicates container."); + +public: + + CUTLASS_HOST_DEVICE + Conv3dDgradFilterTileAccessIteratorOptimized( + Conv3dDgradFilterIteratorOptimizedParams const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + predicates_(0), + filter_trs_(0), + filter_k_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.row() + thread_coord.strided(); + Index column = threadblock_offset.column() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int filter_k = filter_k_ + s * ThreadMap::Delta::kStrided; + int filter_c = column + c * ThreadMap::Delta::kContiguous; + + uint32_t pred = ((filter_k < problem_size_.K && filter_c < problem_size_.C) ? 
1u : 0); + + int pred_idx = c + s * ThreadMap::Iterations::kContiguous; + + predicates_ |= (pred << pred_idx); + } + } + + pointer_ += ( + filter_k_ * params.layout.stride()[3] + column + ) * sizeof_bits::value / 8; + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + LongIndex next = params_.inc_next_trs; + + // moves to the next tile + ++filter_trs_; + if (filter_trs_ == params_.TRS) { + + filter_trs_ = 0; + next = params_.inc_next_k; + filter_k_ += params_.filter_k_delta; + } + + // Clear predicates if needed + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + if (filter_k_ + s * ThreadMap::Delta::kStrided >= problem_size_.K) { + uint32_t kClearMask = ((1u << ThreadMap::Iterations::kContiguous) - 1) << (s * ThreadMap::Iterations::kContiguous); + + predicates_ = (predicates_ & (~kClearMask)); + } + } + + pointer_ += next; + } + + /// Returns true if the current coordinate is within the filter tensor W + CUTLASS_HOST_DEVICE + bool valid() { + LongIndex pred_idx = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous; + return (predicates_ & (1u << pred_idx)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + return reinterpret_cast(pointer_ + + iteration_contiguous_ * ThreadMap::Delta::kContiguous * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dDgradFilterTileAccessIteratorOptimized &operator++() { + 
++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + + // Move to the next K coordinate within the tile + pointer_ += params_.inc_next_strided; + + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h new file mode 100644 index 0000000000000000000000000000000000000000..700c3d12ddfd53b0acef8b6c11188499ca021f76 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h @@ -0,0 +1,343 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. 
+ + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + conv::StrideSupport StrideSupport_ = conv::StrideSupport::kStrided +> +class Conv3dDgradOutputGradientTileAccessIteratorAnalytic; +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Conv3dDgradOutputGradientTileAccessIteratorAnalytic strided dgrad needs special handling using +// unscaled coordinations +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv3dDgradOutputGradientTileAccessIteratorAnalytic < + Shape_, + Element_, + ThreadMap_, + conv::StrideSupport::kStrided +> { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const 
kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + static int const kAccessesPerVector = 1; + + static_assert(sizeof_bits::value >= 8, + "DGRAD requires elements of size 8b or greater."); + + // + // Simpligying assertions + // + + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + struct Params { + + Layout layout; + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params( + ConvProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + } + }; + +private: + + Params const ¶ms_; + ConvProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + int filter_k_; + int filter_t_; + int filter_r_; + int filter_s_; + + int offset_n_[ThreadMap::Iterations::kStrided]; + int offset_d_[ThreadMap::Iterations::kStrided]; + int offset_w_[ThreadMap::Iterations::kStrided]; + int offset_h_[ThreadMap::Iterations::kStrided]; + +private: + + /// Returns the coordinate in the output tensor Dy that is currently pointed to + /// by the iterator but DOES NOT scale by the convolution stride. This is needed + /// to compute predicates in the valid() method. The return value of the public at() + /// method is correctly scaled. 
+ CUTLASS_HOST_DEVICE + TensorCoord unscaled_at_() const { + int n = offset_n_[iteration_strided_]; + int d = offset_d_[iteration_strided_]; + int h = offset_h_[iteration_strided_]; + int w = offset_w_[iteration_strided_]; + + int t = filter_t_; + int r = filter_r_; + int s = filter_s_; + + if (problem_size_.mode == Mode::kConvolution) { + t = (problem_size_.T - 1 - t); + r = (problem_size_.R - 1 - r); + s = (problem_size_.S - 1 - s); + } + + int z = (d + problem_size_.pad_d - t * problem_size_.dilation_d); + int p = (h + problem_size_.pad_h - r * problem_size_.dilation_h); + int q = (w + problem_size_.pad_w - s * problem_size_.dilation_w); + + return TensorCoord(n, z, p, q, filter_k_); + } + +public: + + CUTLASS_HOST_DEVICE + Conv3dDgradOutputGradientTileAccessIteratorAnalytic( + Params const ¶ms, + ConvProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // threadblock offset - units are whole CTA tiles + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_k_(0), + filter_t_(0), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.column() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + int offset_ndhw = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + offset_n_[s] = offset_ndhw / (problem_size_.D * problem_size_.H * problem_size_.W); + int residual = offset_ndhw % (problem_size_.D * problem_size_.H * problem_size_.W); + + offset_d_[s] = residual / (problem_size_.H * problem_size_.W); + residual = residual % (problem_size_.H * problem_size_.W); + + offset_h_[s] = residual / problem_size_.W; + offset_w_[s] = residual % problem_size_.W; + } + } + + CUTLASS_HOST_DEVICE + static Params getParams(Conv3dProblemSize const &problem_size, Layout 
const &layout) { + return Params(problem_size, layout); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // move to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + ++filter_t_; + if (filter_t_ < problem_size_.T) { + return; + } + filter_t_ = 0; + + filter_k_ += Shape_::kColumn * problem_size_.split_k_slices; + } + + /// Returns the coordinate in the output tensor Dy that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + TensorCoord coord = unscaled_at_(); + + return TensorCoord( + coord.n(), + coord.d() / problem_size_.stride_d, + coord.h() / problem_size_.stride_h, + coord.w() / problem_size_.stride_w, + coord.c()); + } + + + /// Returns true if the current coordinate is within the output tensor Dy + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord unscaled_coord = unscaled_at_(); + TensorCoord coord = at(); + + return + !(unscaled_coord.d() % problem_size_.stride_d) && + !(unscaled_coord.h() % problem_size_.stride_h) && + !(unscaled_coord.w() % problem_size_.stride_w) && + coord.n() < problem_size_.N && + coord.d() >= 0 && coord.d() < problem_size_.Z && + coord.h() >= 0 && coord.h() < problem_size_.P && + coord.w() >= 0 && coord.w() < problem_size_.Q && + coord.c() < problem_size_.K; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = 
at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dDgradOutputGradientTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(ConvProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h new file mode 100644 index 0000000000000000000000000000000000000000..69915babcbfcacc1a1830a4f9d70885aca5d40c8 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h @@ -0,0 +1,489 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA 
CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. 
+ + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/conv/threadblock/conv3d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + conv::StrideSupport StrideSupport_ = conv::StrideSupport::kUnity +> +class Conv3dDgradOutputGradientTileAccessIteratorOptimized { +public: + + static_assert(StrideSupport_ == conv::StrideSupport::kUnity, + "Only unit-stride dgrad is supported at this time."); + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kUnity; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + using Coord3D = Coord<3>; + static int const kAccessesPerVector = 1; + using Mask = 
uint64_t; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + using Params = Conv3dDgradOutputGradientIteratorOptimizedParams; + +private: + + Params const ¶ms_; + ConvProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + + + // One pointer per access + char const *pointer_[ThreadMap::Iterations::kStrided]; + + // current filter position (t, r, s) + int filter_t_; + int filter_r_; + int filter_s_; + int filter_k_; + + Index masks_[ThreadMap::Iterations::kStrided][3]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dDgradOutputGradientTileAccessIteratorOptimized( + Params const ¶ms, + ConvProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // tile index - units are threadblock-scoped tiles + ): + params_(params), + problem_size_(problem_size), + filter_k_(0), + filter_t_(0), + filter_r_(0), + filter_s_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.column() + thread_coord.contiguous(); + + int offset_n[ThreadMap::Iterations::kStrided]; + int offset_d[ThreadMap::Iterations::kStrided]; + int offset_h[ThreadMap::Iterations::kStrided]; + int offset_w[ThreadMap::Iterations::kStrided]; + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + pointer_[s] = reinterpret_cast(ptr); + + int offset_ndhw = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // offset_n[s] = offset_ndhw / (problem_size_.D * problem_size_.H * problem_size_.W); + // int residual = offset_ndhw % (problem_size_.D * problem_size_.H * problem_size_.W); + // + // + // offset_d[s] = residual / (problem_size_.H 
* problem_size_.W); + // residual = residual % (problem_size_.H * problem_size_.W); + // + // offset_h[s] = residual / problem_size_.W; + // offset_w[s] = residual % problem_size_.W; + // + + int residual; + + // input: (ndhw offset) output: (n offset and resudial (dhw offset)) + params_.dhw_divmod(offset_n[s], residual, offset_ndhw); + // input: (dhw offset) output: (d offset and resudial (hw)) + params_.hw_divmod(offset_d[s], residual, residual); + // input: (hw offset) output: (h offset and resudial (w offset)) + params_.w_divmod(offset_h[s], offset_w[s], residual); + + TensorCoord coord = at_(offset_n[s], offset_d[s], offset_h[s], offset_w[s], 0, 0, 0); + + pointer_[s] += params_.layout(coord) * sizeof_bits::value / 8; + } + + clear_mask(); + + CUTLASS_PRAGMA_NO_UNROLL + for (int t = 0; t < problem_size_.T; ++t) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int t_ = t; + if (problem_size_.mode == Mode::kConvolution) { + t_ = problem_size_.T - 1 - t; + } + + int z = offset_d[s_idx] + problem_size_.pad_d - t_ * problem_size_.dilation_d; + + bool pred = (offset_n[s_idx] < problem_size_.N && z >= 0 && z < problem_size_.Z); + masks_[s_idx][0] |= (pred << t); + } + } + + CUTLASS_PRAGMA_NO_UNROLL + for (int r = 0; r < problem_size_.R; ++r) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int r_ = r; + if (problem_size_.mode == Mode::kConvolution) { + r_ = problem_size_.R - 1 - r; + } + + int p = offset_h[s_idx] + problem_size_.pad_h - r_ * problem_size_.dilation_h; + + bool pred = (p >= 0 && p < problem_size_.P); + masks_[s_idx][1] |= (pred << r); + } + } + + CUTLASS_PRAGMA_NO_UNROLL + for (int s = 0; s < problem_size_.S; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int s_ = s; + if (problem_size_.mode == Mode::kConvolution) { + s_ = problem_size_.S - 1 - s; + } + + int q = offset_w[s_idx] + 
problem_size_.pad_w - s_ * problem_size_.dilation_w; + + bool pred = (q >= 0 && q < problem_size_.Q); + masks_[s_idx][2] |= (pred << s); + } + } + + if (filter_k_ >= problem_size.K) { + clear_mask(); + } + + set_iteration_index(0); + + } + + CUTLASS_HOST_DEVICE + static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}); + } + +private: + + + /// Returns the coordinate in the output gradient tensor dy that is correspoinding to + // activation ndhw and filter position k, t, r, s + CUTLASS_HOST_DEVICE + TensorCoord at_(int n, int d, int h, int w, int t, int r, int s) const { + + if (problem_size_.mode == Mode::kConvolution) { + t = problem_size_.T - 1 - t; + r = problem_size_.R - 1 - r; + s = problem_size_.S - 1 - s; + } + + int z = d + problem_size_.pad_d - t * problem_size_.dilation_d; + int p = h + problem_size_.pad_h - r * problem_size_.dilation_h; + int q = w + problem_size_.pad_w - s * problem_size_.dilation_w; + + return TensorCoord(n, z, p, q, filter_k_); + } + + + /// Adds a pointer offset in units of element + CUTLASS_HOST_DEVICE + void add_byte_offset_(LongIndex byte_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + pointer_[s] += byte_offset; + } + } + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void clear_mask_(bool clear) { + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + // We are using inline PTX assembly here to avoid an CUDA C++ compilation + // artifact in which control flow instructions are generated. Instead, our + // intent is to predicate the mov instructions. 
+ #if defined(__CUDA_ARCH__) + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][0]) + : + "r"((int)clear), + "r"(masks_[s][0]) + ); + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][1]) + : + "r"((int)clear), + "r"(masks_[s][1]) + ); + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][2]) + : + "r"((int)clear), + "r"(masks_[s][2]) + ); + #else + if (clear) { + masks_[s][0] = 0; + masks_[s][1] = 0; + masks_[s][2] = 0; + } + #endif + } + } + +public: + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + add_byte_offset_(pointer_offset * sizeof_bits::value / 8); + } + + + CUTLASS_HOST_DEVICE + void advance() { + + int next_idx = 0; + + // moves to the next tile + ++filter_s_; + if (filter_s_ == problem_size_.S) { + + filter_s_ = 0; + ++filter_r_; + next_idx = 1; + + if (filter_r_ == problem_size_.R) { + filter_r_ = 0; + ++filter_t_; + + if (filter_t_ < problem_size_.T) { + next_idx = 2; + } + else { + filter_t_ = 0; + next_idx = 3; + } + } + } + + add_byte_offset_(params_.inc_next[next_idx]); + + if (next_idx == 3) { + filter_k_ += params_.filter_k_delta; + } + + clear_mask_(filter_k_ >= problem_size_.K); + } + + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void clear_mask() { + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < 
ThreadMap::Iterations::kStrided; ++s) { + masks_[s][0] = Mask(0); + masks_[s][1] = Mask(0); + masks_[s][2] = Mask(0); + } + } + + CUTLASS_HOST_DEVICE + bool valid() { + + return + (masks_[iteration_strided_][0] & (Index(1) << filter_t_)) && + (masks_[iteration_strided_][1] & (Index(1) << filter_r_)) && + (masks_[iteration_strided_][2] & (Index(1) << filter_s_)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + return reinterpret_cast(pointer_[iteration_strided_]); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dDgradOutputGradientTileAccessIteratorOptimized &operator++() { + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(ConvProblemSize const &problem_size) { + + // This is specialized for unit stride + if (problem_size.stride() != Coord3D({1, 1, 1})) { + return Status::kErrorNotSupported; + } + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % AccessType::kElements) { + return Status::kErrorNotSupported; + } + + // Limit on filter size + if (problem_size.T > 32 || problem_size.R > 32 || problem_size.S > 32) { + return Status::kErrorNotSupported; + } + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h new file mode 100644 index 0000000000000000000000000000000000000000..5a888e0fe4e63a255cd8bdb6b27de831691f71c8 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h @@ -0,0 +1,291 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/conv/threadblock/conv3d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv3dFpropActivationTileAccessIteratorAnalytic { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + static int const kAccessesPerVector = 1; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + using Params = Conv3dAnalyticParams; + +private: + + Params const ¶ms_; + ConvProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + int filter_t_; + int 
filter_r_; + int filter_s_; + int filter_c_; + + int offset_n_[ThreadMap::Iterations::kStrided]; + int offset_z_[ThreadMap::Iterations::kStrided]; + int offset_p_[ThreadMap::Iterations::kStrided]; + int offset_q_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dFpropActivationTileAccessIteratorAnalytic( + Params const ¶ms, + ConvProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // tile index - units are threadblock-scoped tiles + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_t_(0), + filter_r_(0), + filter_s_(0), + filter_c_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_c_ = threadblock_offset.column() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + int offset_nzpq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + offset_n_[s] = offset_nzpq / (problem_size_.Z * problem_size_.P * problem_size_.Q); + int residual = offset_nzpq % (problem_size_.Z * problem_size_.P * problem_size_.Q); + + offset_z_[s] = residual / (problem_size_.P * problem_size_.Q); + residual = residual % (problem_size_.P * problem_size_.Q); + + offset_p_[s] = residual / problem_size_.Q; + offset_q_[s] = residual % problem_size_.Q; + } + + set_iteration_index(0); + } + + CUTLASS_HOST_DEVICE + static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex 
pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + ++filter_t_; + if (filter_t_ < problem_size_.T) { + return; + } + filter_t_ = 0; + + filter_c_ += Shape::kColumn * problem_size_.split_k_slices; + } + + /// Returns the coordinate in the activations tensor X that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + int n = offset_n_[iteration_strided_]; + int z = offset_z_[iteration_strided_]; + int p = offset_p_[iteration_strided_]; + int q = offset_q_[iteration_strided_]; + + int t = filter_t_; + int r = filter_r_; + int s = filter_s_; + + if (problem_size_.mode == Mode::kConvolution) { + t = (problem_size_.T - 1 - filter_t_); + r = (problem_size_.R - 1 - filter_r_); + s = (problem_size_.S - 1 - filter_s_); + } + + int d = z * problem_size_.stride_d - problem_size_.pad_d + t * problem_size_.dilation_d; + int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h; + int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w; + + return TensorCoord(n, d, h, w, filter_c_); + } + + /// Returns true if the current coordinate is within the activations tensor X + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.d() >= 0 && coord.d() < problem_size_.D && + coord.h() >= 0 && coord.h() < problem_size_.H && + coord.w() >= 0 && coord.w() < problem_size_.W && + coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + AccessType const *ptr = 
reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + + return ptr; + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dFpropActivationTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(ConvProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h new file mode 100644 index 0000000000000000000000000000000000000000..057023c09cb73199bda94f62c3c879269d7b5189 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h @@ -0,0 +1,478 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. 
+ + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/conv/threadblock/conv3d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename Layout_, + typename ThreadMap_ +> +class Conv3dFpropActivationTileAccessIteratorOptimized { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + static int const kAccessesPerVector = 1; + using Mask = uint64_t; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + using Params = 
Conv3dFpropActivationIteratorOptimizedParams; + +private: + + Conv3dFpropActivationIteratorOptimizedParams const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + + // One pointer per access + char const *pointer_[ThreadMap::Iterations::kStrided]; + + // current filter position (t, r, s) + int filter_t_; + int filter_r_; + int filter_s_; + int filter_c_; + + // mask for t, r, and s + Index masks_[ThreadMap::Iterations::kStrided][3]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dFpropActivationTileAccessIteratorOptimized( + Conv3dFpropActivationIteratorOptimizedParams const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() // tile index - units are threadblock-scoped tiles + ) : + params_(params), + problem_size_(problem_size), + filter_t_(0), + filter_r_(0), + filter_s_(0), + filter_c_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_c_ = threadblock_offset.column() + thread_coord.contiguous(); + + int offset_n[ThreadMap::Iterations::kStrided]; + int offset_z[ThreadMap::Iterations::kStrided]; + int offset_p[ThreadMap::Iterations::kStrided]; + int offset_q[ThreadMap::Iterations::kStrided]; + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + pointer_[s] = reinterpret_cast(ptr); + + int offset_nzpq = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // offset_n[s] = offset_nzpq / (problem_size_.Z * problem_size_.P * problem_size_.Q); + // int residual = offset_nzpq % (problem_size_.Z * problem_size_.P * problem_size_.Q); + // + // offset_z[s] = residual / (problem_size_.P * problem_size_.Q); + // residual = residual % (problem_size_.P * problem_size_.Q); + // + // offset_p[s] = 
residual / problem_size_.Q; + // offset_q[s] = residual % problem_size_.Q; + // + + int residual; + + // input: (nzpq offset) output: (n offset and resudial (zpq offset)) + params.zpq_divmod(offset_n[s], residual, offset_nzpq); + // input: (zpq offset) output: (z offset and resudial (pq)) + params.pq_divmod(offset_z[s], residual, residual); + // input: (pq offset) output: (p offset and resudial (q offset)) + params.q_divmod(offset_p[s], offset_q[s], residual); + + TensorCoord coord = at_(offset_n[s], offset_z[s], offset_p[s], offset_q[s], 0, 0, 0); + + pointer_[s] += params_.layout(coord) * sizeof_bits::value / 8; + } + + clear_mask(); + + // mask predicates for filter position T + CUTLASS_PRAGMA_NO_UNROLL + for (int t = 0; t < problem_size_.T; ++t) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int t_ = t; + if (problem_size_.mode == Mode::kConvolution) { + t_ = problem_size_.T - 1 - t; + } + + int d = offset_z[s_idx] * problem_size_.stride_d - problem_size_.pad_d + t_ * problem_size_.dilation_d; + + bool pred = (offset_n[s_idx] < problem_size_.N && d >= 0 && d < problem_size_.D); + masks_[s_idx][0] |= (pred << t); + } + } + + // mask predicates for filter position R + CUTLASS_PRAGMA_NO_UNROLL + for (int r = 0; r < problem_size_.R; ++r) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int r_ = r; + if (problem_size_.mode == Mode::kConvolution) { + r_ = problem_size_.R - 1 - r; + } + + int h = offset_p[s_idx] * problem_size_.stride_h - problem_size_.pad_h + r_ * problem_size_.dilation_h; + + bool pred = (h >= 0 && h < problem_size_.H); + masks_[s_idx][1] |= (pred << r); + } + } + + // mask predicates for filter position S + CUTLASS_PRAGMA_NO_UNROLL + for (int s = 0; s < problem_size_.S; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int s_idx = 0; s_idx < ThreadMap::Iterations::kStrided; ++s_idx) { + + int s_ = s; + if (problem_size_.mode == Mode::kConvolution) { 
+ s_ = problem_size_.S - 1 - s; + } + + int w = offset_q[s_idx] * problem_size_.stride_w - problem_size_.pad_w + s_ * problem_size_.dilation_w; + + bool pred = (w >= 0 && w < problem_size_.W); + masks_[s_idx][2] |= (pred << s); + } + } + + if (filter_c_ >= problem_size.C) { + clear_mask(); + } + + set_iteration_index(0); + } + + CUTLASS_HOST_DEVICE + static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}); + } + +private: + + /// Returns the coordinate in the activations tensor X that is correspoinding to + // output nzpq and filter position t, r, s + CUTLASS_HOST_DEVICE + TensorCoord at_(int n, int z, int p, int q, int t, int r, int s) const { + + if (problem_size_.mode == Mode::kConvolution) { + t = problem_size_.T - 1 - t; + r = problem_size_.R - 1 - r; + s = problem_size_.S - 1 - s; + } + + int d = z * problem_size_.stride_d - problem_size_.pad_d + t * problem_size_.dilation_d; + int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h; + int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w; + + return TensorCoord(n, d, h, w, filter_c_); + } + + /// Adds a pointer offset in units of element + CUTLASS_HOST_DEVICE + void add_byte_offset_(LongIndex byte_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + pointer_[s] += byte_offset; + } + } + + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void clear_mask_(bool clear) { + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + // We are using inline PTX assembly here to avoid an CUDA C++ compilation + // artifact in which control flow instructions are 
generated. Instead, our + // intent is to predicate the mov instructions. + #if defined(__CUDA_ARCH__) + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][0]) + : + "r"((int)clear), + "r"(masks_[s][0]) + ); + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][1]) + : + "r"((int)clear), + "r"(masks_[s][1]) + ); + asm volatile( + "{\n" + " .reg .pred p;\n" + " .reg .u32 m;" + " mov.u32 m, %2;" + " setp.ne.b32 p, %1, 0;\n" + " @p mov.u32 m, 0;\n" + " mov.u32 %0, m;\n" + "}\n" + : + "=r"(masks_[s][2]) + : + "r"((int)clear), + "r"(masks_[s][2]) + ); + #else + if (clear) { + masks_[s][0] = 0; + masks_[s][1] = 0; + masks_[s][2] = 0; + } + #endif + } + } + +public: + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + add_byte_offset_(pointer_offset * sizeof_bits::value / 8); + } + + CUTLASS_HOST_DEVICE + void advance() { + + int next_idx = 0; + + // moves to the next tile + ++filter_s_; + if (filter_s_ == problem_size_.S) { + + filter_s_ = 0; + ++filter_r_; + next_idx = 1; + + if (filter_r_ == problem_size_.R) { + filter_r_ = 0; + ++filter_t_; + + if (filter_t_ < problem_size_.T) { + next_idx = 2; + } + else { + filter_t_ = 0; + next_idx = 3; + } + } + } + + add_byte_offset_(params_.inc_next[next_idx]); + + if (next_idx == 3) { + filter_c_ += params_.filter_c_delta; + } + + clear_mask_(filter_c_ >= problem_size_.C); + } + + /// Clears the predicates + CUTLASS_HOST_DEVICE + void 
clear_mask() { + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + masks_[s][0] = Mask(0); + masks_[s][1] = Mask(0); + masks_[s][2] = Mask(0); + } + } + + CUTLASS_HOST_DEVICE + bool valid() { + + return + (masks_[iteration_strided_][0] & (Index(1) << filter_t_)) && + (masks_[iteration_strided_][1] & (Index(1) << filter_r_)) && + (masks_[iteration_strided_][2] & (Index(1) << filter_s_)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + return reinterpret_cast(pointer_[iteration_strided_]); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dFpropActivationTileAccessIteratorOptimized &operator++() { + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + // Conv3dFpropActivationTileAccessIteratorOptimized has constraint on filter positions + // due to the number of mask bits. 
+ if (problem_size.T > 32 || problem_size.R > 32 || problem_size.S > 32) { + return Status::kErrorNotSupported; + } + return Status::kSuccess; + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h new file mode 100644 index 0000000000000000000000000000000000000000..4a40d37e56bfc73744f80dc0cf84e30918b86a1b --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h @@ -0,0 +1,259 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/conv/threadblock/conv3d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_, + bool IsDeconv_ = false +> +class Conv3dFpropFilterTileAccessIteratorAnalytic { +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static bool const IsDeconv = IsDeconv_; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + static int const kAccessesPerVector = 1; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + using Params = Conv3dAnalyticParams; + +private: + + Params const ¶ms_; + ConvProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + 
int filter_t_; + int filter_r_; + int filter_s_; + int filter_c_; + + int offset_k_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dFpropFilterTileAccessIteratorAnalytic( + Params const ¶ms, + ConvProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_t_(0), + filter_r_(0), + filter_s_(0), + filter_c_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_c_ = threadblock_offset.row() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_k_[s] = threadblock_offset.column() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + } + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * 8 / sizeof_bits::value; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next tile + ++filter_s_; + if (filter_s_ < problem_size_.S) { + return; + } + filter_s_ = 0; + + ++filter_r_; + if (filter_r_ < problem_size_.R) { + return; + } + filter_r_ = 0; + + ++filter_t_; + if (filter_t_ < problem_size_.T) { + return; + } + filter_t_ = 0; + + filter_c_ += Shape::kRow * problem_size_.split_k_slices; + } + + /// Returns the coordinate in the filter tensor W that is currently pointed to + /// by the iterator. 
+ CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int k = offset_k_[iteration_strided_]; + + return TensorCoord(k, filter_t_, filter_r_, filter_s_, filter_c_); + } + + /// Returns true if the current coordinate is within the activations tensor W + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + auto input_channels = (IsDeconv ? problem_size_.K : problem_size_.C); + auto output_channels = (IsDeconv ? problem_size_.C : problem_size_.K); + + return coord.n() < output_channels && + coord.c() < input_channels; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dFpropFilterTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(ConvProblemSize const &problem_size) { + auto input_channels = (IsDeconv ? problem_size.K : problem_size.C); + auto output_channels = (IsDeconv ? 
problem_size.C : problem_size.K); + // check alignment constraint on iterator's contiguous dimension + if (input_channels % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h new file mode 100644 index 0000000000000000000000000000000000000000..b4e7db3a4398b67a2a2cf185cf9e689a22d3d0b8 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h @@ -0,0 +1,279 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNHWC or TensorCxRSKx layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +#include "cutlass/conv/threadblock/conv3d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename Layout_, + typename ThreadMap_, + bool IsDeconv_ = false +> +class Conv3dFpropFilterTileAccessIteratorOptimized{ +public: + + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static bool const IsDeconv = IsDeconv_; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + static int const kAccessesPerVector = 1; + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + + struct Params : Conv3dFpropFilterIteratorOptimizedParams { + + CUTLASS_HOST_DEVICE + Params() { } + + CUTLASS_HOST_DEVICE + Params(Conv3dFpropFilterIteratorOptimizedParams const &base): + 
Conv3dFpropFilterIteratorOptimizedParams(base) { } + + CUTLASS_HOST_DEVICE + Params( + Conv3dProblemSize const &problem_size, + Layout const &layout + ): + Conv3dFpropFilterIteratorOptimizedParams( + problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided} + ) { + + } + }; + +private: + + Conv3dFpropFilterIteratorOptimizedParams const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + uint32_t predicates_; + int filter_trs_; + int filter_c_; + + // + // Assertions + // + + // We map predicates into bits packed in this uint32_t container + static_assert(ThreadMap::Iterations::kStrided < sizeof(predicates_) * 8, + "Currently, the number of loads per iteration is limited by the size of the predicates container."); + +public: + + CUTLASS_HOST_DEVICE + Conv3dFpropFilterTileAccessIteratorOptimized( + Conv3dFpropFilterIteratorOptimizedParams const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + predicates_{0}, + filter_trs_(0), + filter_c_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_c_ = threadblock_offset.row() + thread_coord.contiguous(); + Index column = threadblock_offset.column() + thread_coord.strided(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + uint32_t pred = ((column + s * ThreadMap::Delta::kStrided < (IsDeconv ? problem_size_.C : problem_size_.K)) ? 1u : 0); + predicates_ |= (pred << s); + } + + if (filter_c_ >= (IsDeconv ? 
problem_size_.K : problem_size_.C)) { + predicates_ = 0u; + } + + pointer_ += ( + params_.layout({filter_c_, column}) + ) * sizeof_bits::value / 8; + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + LongIndex next = params_.inc_next_trs; + + // moves to the next tile + ++filter_trs_; + if (filter_trs_ == params_.TRS) { + + filter_trs_ = 0; + next = params_.inc_next_c; + filter_c_ += params_.filter_c_delta; + } + + if (filter_c_ >= (IsDeconv ? problem_size_.K : problem_size_.C)) { + predicates_ = 0; + } + + pointer_ += next; + } + + /// Returns true if the current coordinate is within the filter tensor W + CUTLASS_HOST_DEVICE + bool valid() { + return (predicates_ & (1u << iteration_strided_)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + return reinterpret_cast(pointer_); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dFpropFilterTileAccessIteratorOptimized &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + + // Move to the next K coordinate within the tile + pointer_ += params_.inc_next_k; + + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + auto input_channels = (IsDeconv ? problem_size.K : problem_size.C); + + // check alignment constraint on iterator's contiguous dimension + if (input_channels % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_params.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_params.h new file mode 100644 index 0000000000000000000000000000000000000000..941f4e1dff7ebfffd6830ad76876087b98a0c8b0 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_params.h @@ -0,0 +1,508 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! + \file + \brief Extracts the host-params objects into non-template code. 
+*/ + +#pragma once + +#define TRACE_CONV_PARAMS_INITIALIZERS_ENABLED 0 + +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/threadblock/conv2d_params.h" +#include "cutlass/conv/conv3d_problem_size.h" + +#if TRACE_CONV_PARAMS_INITIALIZERS_ENABLED +#include +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Params structure used for all Conv3d analytic tile iterators +template< typename Layout_ = layout::TensorNDHWC > +struct Conv3dAnalyticParams { + + using Layout = Layout_; + + Layout layout; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv3dAnalyticParams() { } + + CUTLASS_HOST_DEVICE + Conv3dAnalyticParams( + Conv3dProblemSize const &, // unused; placeholder to match other Params interfaces. 
+ Layout const &layout + ): layout(layout) { + + } +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters structure used for Conv3dFpropActivationTileIteratorOptimized +template< typename Layout_ = layout::TensorNDHWC > +struct Conv3dFpropActivationIteratorOptimizedParams; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters structure used for Conv3dFpropActivationTileIteratorOptimized +template<> +struct Conv3dFpropActivationIteratorOptimizedParams { + + using Layout = layout::TensorNDHWC; + + Layout layout; + + int64_t inc_next[4]; // {next S, next R, next T, next C} + int filter_c_delta; // number of logical elements to add to filter_c_ + int ZPQ; // product of Z*P*Q + int PQ; // product of P*Q + + FastDivmod zpq_divmod; + FastDivmod pq_divmod; + FastDivmod q_divmod; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv3dFpropActivationIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv3dFpropActivationIteratorOptimizedParams( + Conv3dProblemSize const &problem_size, + Layout const &layout, ///< layout object + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), + PQ(problem_size.P * problem_size.Q), + ZPQ(problem_size.Z * problem_size.P * problem_size.Q), + zpq_divmod(ZPQ), + pq_divmod(PQ), + q_divmod(problem_size.Q) { + + TRACE_CONV_INITIALIZERS("conv3d_fprop", "activation", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + + int conv_sign = (problem_size.mode == Mode::kConvolution ? 
-1 : 1); + + // next S + inc_next[0] = conv_sign * ( + int64_t(layout.stride()[0]) * problem_size.dilation_w + ) * element_size_bits / 8; + + // next R + inc_next[1] = conv_sign * ( + int64_t(layout.stride()[1]) * problem_size.dilation_h + - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // next T + inc_next[2] = conv_sign * ( + int64_t(layout.stride()[2]) * problem_size.dilation_d + - (problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h + - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // next C + inc_next[3] = ( + threadblock_shape.column() * problem_size.split_k_slices + - conv_sign * int64_t(problem_size.T - 1) * layout.stride()[2] * problem_size.dilation_d + - conv_sign * int64_t(problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h + - conv_sign * int64_t(problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // logical offset added to internal channel counter - units are elements, not bytes + filter_c_delta = threadblock_shape.column() * problem_size.split_k_slices; + } +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + + +template< typename Layout_ = layout::TensorNDHWC > +struct Conv3dFpropFilterIteratorOptimizedParams; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +struct Conv3dFpropFilterIteratorOptimizedParams +{ + + using Layout = layout::TensorNDHWC; + + Layout layout; + int TRS; + int filter_c_delta; + + int64_t inc_next_k; // offset in units of bytes to next K position + int64_t inc_next_trs; // offset in units of bytes to next TRS position + int64_t inc_next_c; // offset in units of bytes to next C position + + // + // Methods + // + CUTLASS_HOST_DEVICE + Conv3dFpropFilterIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + 
Conv3dFpropFilterIteratorOptimizedParams( + Conv3dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout) { + + TRACE_CONV_INITIALIZERS("conv3d_fprop", "filter", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + TRS = problem_size.T * problem_size.R * problem_size.S; + + inc_next_k = (int64_t(layout.stride()[3]) * threadmap_delta.strided() * element_size_bits) / 8; + + inc_next_trs = + ( int64_t(layout.stride()[0]) + - int64_t(layout.stride()[3]) * (threadmap_iterations.strided() - 1) * threadmap_delta.strided() + ) * element_size_bits / 8; + + inc_next_c = + ( + threadblock_shape.row() * problem_size.split_k_slices + - int64_t(TRS - 1) * layout.stride()[0] + - int64_t(threadmap_iterations.strided() - 1) * threadmap_delta.strided() * layout.stride()[3] + ) * element_size_bits / 8; + + filter_c_delta = threadblock_shape.row() * problem_size.split_k_slices; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters object for Conv3d DGRAD OutputGradient (dy) iterator +struct Conv3dDgradOutputGradientIteratorOptimizedParams { + + using Layout = layout::TensorNDHWC; + + Layout layout; + + int64_t inc_next[4]; // {next S, next R, next T, next K} + int filter_k_delta; // number of logical elements to add to filter_k_ + + FastDivmod dhw_divmod; + FastDivmod hw_divmod; + FastDivmod w_divmod; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv3dDgradOutputGradientIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv3dDgradOutputGradientIteratorOptimizedParams( + Conv3dProblemSize const &problem_size, + Layout const &layout, ///< layout object + int element_size_bits, ///< size of each element in 
bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), + dhw_divmod(problem_size.D * problem_size.H * problem_size.W), + hw_divmod(problem_size.H * problem_size.W), + w_divmod(problem_size.W) { + + TRACE_CONV_INITIALIZERS("conv3d_dgrad", "output_gradient", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + int conv_sign = (problem_size.mode == Mode::kConvolution ? 1 : -1); + + // next S + inc_next[0] = conv_sign * ( + int64_t(layout.stride()[0]) * problem_size.dilation_w + ) * element_size_bits / 8; + + // next R + inc_next[1] = conv_sign * ( + int64_t(layout.stride()[1]) * problem_size.dilation_h + - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // next T + inc_next[2] = conv_sign * ( + int64_t(layout.stride()[2]) * problem_size.dilation_d + - (problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h + - (problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // next K + inc_next[3] = ( + threadblock_shape.column() * problem_size.split_k_slices + - conv_sign * int64_t(problem_size.T - 1) * layout.stride()[2] * problem_size.dilation_d + - conv_sign * int64_t(problem_size.R - 1) * layout.stride()[1] * problem_size.dilation_h + - conv_sign * int64_t(problem_size.S - 1) * layout.stride()[0] * problem_size.dilation_w + ) * element_size_bits / 8; + + // logical offset added to internal channel counter - units are elements, not bytes + filter_k_delta = threadblock_shape.column() * problem_size.split_k_slices; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters object for Conv2d DGRAD Filter (w) iterator +struct Conv3dDgradFilterIteratorOptimizedParams { + + using Layout = layout::TensorNDHWC; + + 
Layout layout; + int TRS; + int filter_k_delta; + + int64_t inc_next_strided; // offset in units of bytes to next K coordinate within tile + int64_t inc_next_trs; // offset in units of bytes to next TRS position + int64_t inc_next_k; // offset in units of bytes to next K position in subsequent tile + + // + // Methods + // + CUTLASS_HOST_DEVICE + Conv3dDgradFilterIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv3dDgradFilterIteratorOptimizedParams( + Conv3dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, ///< size of each element in bits + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): + layout(layout), TRS(problem_size.T * problem_size.R * problem_size.S) { + + TRACE_CONV_INITIALIZERS("conv3d_dgrad", "filter", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + inc_next_strided = ((int64_t)layout.stride()[3] * threadmap_delta.strided() * element_size_bits) / 8; + + inc_next_trs = + ( (int64_t)layout.stride()[0] + - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * (int64_t)layout.stride()[3] + ) * element_size_bits / 8; + + inc_next_k = + ( + threadblock_shape.row() * problem_size.split_k_slices * (int64_t)layout.stride()[3] + - (problem_size.T * problem_size.R * problem_size.S - 1) * (int64_t)layout.stride()[0] + - (threadmap_iterations.strided() - 1) * threadmap_delta.strided() * (int64_t)layout.stride()[3] + ) * element_size_bits / 8; + + filter_k_delta = threadblock_shape.row() * problem_size.split_k_slices; + } +}; + +/// Parameters object for Conv3d WGRAD OutputGradient iterator +struct Conv3dWgradOutputGradientIteratorOptimizedParams { + + using Layout = layout::TensorNDHWC; + using LongIndex = typename Layout::LongIndex; + + Layout layout; + + int NZPQ; // precomputd product of N*Z*P*Q for clearing predicates + int ZPQ; 
// product of Z*P*Q + unsigned zpq_mul; // precomputed quantities for fast computation of div/% by ZPQ + unsigned zpq_shr; // in device code. + + int PQ; // product of P*Q + unsigned pq_mul; // precomputed quantities for fast computation of div/% by PQ + unsigned pq_shr; // in device code. + + unsigned q_mul; // precomputed quantities for fast computation of div/% by Q + unsigned q_shr; // in device code. + + LongIndex offset_next_strided; // offset in units of bytes to next nzpq coordinate within tile + LongIndex offset_next_contiguous; // offset in units of bytes to next k coordinate within tile + LongIndex inc_next_nzpq; // offset in units of bytes to next nzpq position in subsequent tile + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Conv3dWgradOutputGradientIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv3dWgradOutputGradientIteratorOptimizedParams( + Conv3dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): layout(layout) { + + TRACE_CONV_INITIALIZERS("conv3d_wgrad", "output_gradient", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + // Incremental offsets in unites of bytes (number of elements) * element_size_bits / 8 + offset_next_strided = (threadmap_delta.strided() * (int64_t)layout.stride()[0]) + * element_size_bits / 8; + + offset_next_contiguous = (threadmap_delta.contiguous()) + * element_size_bits / 8; + + inc_next_nzpq = (threadblock_shape.column() * problem_size.split_k_slices * (int64_t)layout.stride()[0]) + * element_size_bits / 8; + + // Precompute several quantities for fast modulo arithmetic. 
+ NZPQ = problem_size.N * problem_size.Z * problem_size.P * problem_size.Q; + ZPQ = problem_size.Z * problem_size.P * problem_size.Q; + find_divisor(zpq_mul, zpq_shr, ZPQ); + + PQ = problem_size.P * problem_size.Q; + find_divisor(pq_mul, pq_shr, PQ); + + find_divisor(q_mul, q_shr, problem_size.Q); + + } +}; + +/// Parameters object for Conv3d WGRAD Activation Tile Access Iterator +struct Conv3dWgradActivationIteratorOptimizedParams { + + using Layout = layout::TensorNDHWC; + + Layout layout; + + int RSC; // product of R*S*C + unsigned rsc_mul; // precomputed quantities for fast computation of div/% by RSC + unsigned rsc_shr; // in device code. + + int SC; // product of S*C + unsigned sc_mul; // precomputed quantities for fast computation of div/% by SC + unsigned sc_shr; // in device code. + + unsigned c_mul; // precomputed quantities for fast computation of div/% by C + unsigned c_shr; // in device code. + + int ZPQ; // product of Z*P*Q + unsigned zpq_mul; // precomputed quantities for fast computation of div/% by ZPQ + unsigned zpq_shr; // in device code. + + int PQ; // product of P*Q + unsigned pq_mul; // precomputed quantities for fast computation of div/% by PQ + unsigned pq_shr; // in device code. + + unsigned q_mul; // precomputed quantities for fast computation of div/% by Q + unsigned q_shr; // in device code. 
+ + // + // Methods + // + CUTLASS_HOST_DEVICE + Conv3dWgradActivationIteratorOptimizedParams() { } + + CUTLASS_HOST_DEVICE + Conv3dWgradActivationIteratorOptimizedParams( + Conv3dProblemSize const &problem_size, + Layout const &layout, + int element_size_bits, + MatrixCoord threadblock_shape, + int thread_count, + int access_size, + layout::PitchLinearCoord threadmap_iterations, + layout::PitchLinearCoord threadmap_delta + ): layout(layout) { + + TRACE_CONV_INITIALIZERS("conv3d_wgrad", "activation", + element_size_bits, threadblock_shape, thread_count, access_size, threadmap_iterations, threadmap_delta); + + // Precompute several quantities for fast modulo arithmetic. + RSC = problem_size.R * problem_size.S * problem_size.C; + find_divisor(rsc_mul, rsc_shr, RSC); + + SC = problem_size.S * problem_size.C; + find_divisor(sc_mul, sc_shr, SC); + + find_divisor(c_mul, c_shr, problem_size.C); + + ZPQ = problem_size.Z * problem_size.P * problem_size.Q; + find_divisor(zpq_mul, zpq_shr, ZPQ); + + PQ = problem_size.P * problem_size.Q; + find_divisor(pq_mul, pq_shr, PQ); + + find_divisor(q_mul, q_shr, problem_size.Q); + + } +}; + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h new file mode 100644 index 0000000000000000000000000000000000000000..97cad0a131667235fbab4c7dd092c1571ae3ee6c --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h @@ -0,0 +1,289 @@ 
+/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (activation tile) + matrix from memory. 
+ + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv3dWgradActivationTileAccessIteratorAnalytic { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + + static int const kAccessesPerVector = 1; + + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements of size 8b or greater."); + + // + // Parameters structure + // + + struct Params { + + Layout layout; + + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() { } + + 
CUTLASS_HOST_DEVICE + Params( + Conv3dProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + } + }; + +private: + + Params const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + // Filter postion (t,r,s,c) in contiguous dimension stays constant for each gemm_iteration_k + int filter_t_[ThreadMap::Iterations::kContiguous]; + int filter_r_[ThreadMap::Iterations::kContiguous]; + int filter_s_[ThreadMap::Iterations::kContiguous]; + int filter_c_[ThreadMap::Iterations::kContiguous]; + + int offset_nzpq_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dWgradActivationTileAccessIteratorAnalytic( + Params const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + // initialize t,r,s,c filter position for every contiguous iteration + CUTLASS_PRAGMA_UNROLL + for(int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int trsc_offset = threadblock_offset.column() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + + filter_t_[c] = trsc_offset / (problem_size_.R * problem_size_.S * problem_size_.C); + int residual = trsc_offset % (problem_size_.R * problem_size_.S * problem_size_.C); + + filter_r_[c] = residual / (problem_size_.S * problem_size_.C); + residual = residual % (problem_size_.S * problem_size_.C); + + filter_s_[c] = residual / problem_size_.C; + filter_c_[c] = residual % problem_size_.C; + + } + + // initialize n, z, p, q offset for every strided iteration + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + offset_nzpq_[s] = threadblock_offset.row() + thread_coord.strided() + + s * 
ThreadMap::Delta::kStrided; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + + // moves to the next GEMM-K offset (offset_nzpq_) in GEMM-B by a CTA-K tile + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_nzpq_[s] += Shape::kRow * problem_size_.split_k_slices; + } + } + + /// Returns the coordinate in the activation tensor x that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int t = filter_t_[iteration_contiguous_]; + int r = filter_r_[iteration_contiguous_]; + int s = filter_s_[iteration_contiguous_]; + + if (problem_size_.mode == Mode::kConvolution) { + t = (problem_size_.T - 1 - t); + r = (problem_size_.R - 1 - r); + s = (problem_size_.S - 1 - s); + } + + int n = offset_nzpq_[iteration_strided_] / (problem_size_.Z * problem_size_.P * problem_size_.Q); + int residual = offset_nzpq_[iteration_strided_] % (problem_size_.Z * problem_size_.P * problem_size_.Q); + + int z = residual / (problem_size_.P * problem_size_.Q); + residual = residual % (problem_size_.P * problem_size_.Q); + + int p = residual / problem_size_.Q; + int q = residual % problem_size_.Q; + + int d = z * problem_size_.stride_d - problem_size_.pad_d + t * problem_size_.dilation_d; + int h = p * problem_size_.stride_h - problem_size_.pad_h + r * problem_size_.dilation_h; + int w = q * problem_size_.stride_w - problem_size_.pad_w + s * problem_size_.dilation_w; + + return TensorCoord(n, d, h, w, filter_c_[iteration_contiguous_]); + } + + /// Returns true if the current 
coordinate is within the activation tensor x + CUTLASS_HOST_DEVICE + bool valid() const { + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.d() >= 0 && coord.d() < problem_size_.D && + coord.h() >= 0 && coord.h() < problem_size_.H && + coord.w() >= 0 && coord.w() < problem_size_.W && + coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dWgradActivationTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h new file mode 100644 index 0000000000000000000000000000000000000000..7e5475f8f738e4f434f72e1f50a2c5762904cc42 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h @@ -0,0 +1,319 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (activation tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/conv/threadblock/conv3d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv3dWgradActivationTileAccessIteratorOptimized { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + static int const kAccessesPerVector = 1; + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements of size 8b or greater."); + + // + // Parameters structure + // + + struct Params : Conv3dWgradActivationIteratorOptimizedParams { + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() {} + + CUTLASS_HOST_DEVICE + Params(Conv3dWgradActivationIteratorOptimizedParams const &base) + : Conv3dWgradActivationIteratorOptimizedParams(base) {} + + CUTLASS_HOST_DEVICE + 
Params(Conv3dProblemSize const &problem_size, Layout const &layout) + : Conv3dWgradActivationIteratorOptimizedParams( + problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}) {} + }; + +private: + + Params const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + // Precomputed effective filter postion (t,r,s) in contiguous dimension stays constant for each gemm_iteration_k + // required for nzpq -> ndhw translation + int precomputed_filter_t_[ThreadMap::Iterations::kContiguous]; + int precomputed_filter_r_[ThreadMap::Iterations::kContiguous]; + int precomputed_filter_s_[ThreadMap::Iterations::kContiguous]; + + // Channel dimension in contiguous dimension stays constant for each gemm_iteration_k + int filter_c_[ThreadMap::Iterations::kContiguous]; + + int offset_nzpq_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dWgradActivationTileAccessIteratorOptimized( + Params const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + // initialize t,r,s,c filter position for every contiguous iteration + CUTLASS_PRAGMA_UNROLL + for(int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int trsc_offset = threadblock_offset.column() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // filter_t_[c] = trsc_offset / (problem_size_.R * problem_size_.S * 
problem_size_.C); + // int residual = trsc_offset % (problem_size_.R * problem_size_.S * problem_size_.C); + // + // filter_r_[c] = residual / (problem_size_.S * problem_size_.C); + // residual = residual % (problem_size_.S * problem_size_.C); + // + // filter_s_[c] = residual / problem_size_.C; + // filter_c_[c] = residual % problem_size_.C; + + int residual; + fast_divmod(precomputed_filter_t_[c], residual, trsc_offset, params_.RSC, params_.rsc_mul, params_.rsc_shr); + fast_divmod(precomputed_filter_r_[c], residual, residual, params_.SC, params_.sc_mul, params_.sc_shr); + fast_divmod(precomputed_filter_s_[c], filter_c_[c], residual, problem_size_.C, params_.c_mul, params_.c_shr); + + int t = precomputed_filter_t_[c]; + int r = precomputed_filter_r_[c]; + int s = precomputed_filter_s_[c]; + + if (problem_size_.mode == Mode::kConvolution) { + t = (problem_size_.T - 1 - t); + r = (problem_size_.R - 1 - r); + s = (problem_size_.S - 1 - s); + } + + // efective t,r,s for every contiguous dimension + precomputed_filter_t_[c] = - problem_size_.pad_d + t * problem_size_.dilation_d; + precomputed_filter_r_[c] = - problem_size_.pad_h + r * problem_size_.dilation_h; + precomputed_filter_s_[c] = - problem_size_.pad_w + s * problem_size_.dilation_w; + + + } + + // initialize n, z, p, q offset for every strided iteration + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + + offset_nzpq_[s] = threadblock_offset.row() + thread_coord.strided() + + s * ThreadMap::Delta::kStrided; + } + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void 
advance() { + + // moves to the next GEMM-K offset (offset_nzpq_) in GEMM-B by a CTA-K tile + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_nzpq_[s] += Shape::kRow * problem_size_.split_k_slices; + } + } + + /// Returns the coordinate in the activation tensor x that is currently pointed to + /// by the iterator. + + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // int n = offset_nzpq_[iteration_strided_] / (problem_size_.Z * problem_size_.P * problem_size_.Q); + // int residual = offset_nzpq_[iteration_strided_] % (problem_size_.Z * problem_size_.P * problem_size_.Q); + // + // int z = residual / (problem_size_.P * problem_size_.Q); + // residual = residual % (problem_size_.P * problem_size_.Q); + // + // int p = residual / problem_size_.Q; + // int q = residual % problem_size_.Q; + + int residual, n, z, p, q; + fast_divmod(n, residual, offset_nzpq_[iteration_strided_], params_.ZPQ, params_.zpq_mul, params_.zpq_shr); + fast_divmod(z, residual, residual, params_.PQ, params_.pq_mul, params_.pq_shr); + fast_divmod(p, q, residual, problem_size_.Q, params_.q_mul, params_.q_shr); + + int d = z * problem_size_.stride_d + precomputed_filter_t_[iteration_contiguous_]; + int h = p * problem_size_.stride_h + precomputed_filter_r_[iteration_contiguous_]; + int w = q * problem_size_.stride_w + precomputed_filter_s_[iteration_contiguous_]; + + return TensorCoord(n, d, h, w, filter_c_[iteration_contiguous_]); + } + + /// Returns true if the current coordinate is within the activation tensor x + CUTLASS_HOST_DEVICE + bool valid() const { + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.d() >= 0 && coord.d() < problem_size_.D && + coord.h() >= 0 && coord.h() < problem_size_.H && + coord.w() >= 0 && coord.w() < problem_size_.W && + coord.c() < problem_size_.C; + } + + /// Returns a pointer to the 
vector starting at the current coordinate + CUTLASS_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dWgradActivationTileAccessIteratorOptimized &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h new file mode 100644 index 0000000000000000000000000000000000000000..cbe49985f5df8b76bfd1e57552e47577c379f229 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h @@ -0,0 +1,267 @@ 
+/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) + matrix from memory. 
+ + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv3dWgradOutputGradientTileAccessIteratorAnalytic { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kAnalytic; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + static int const kAccessesPerVector = 1; + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements of size 8b or greater."); + + // + // Parameters structure + // + + struct Params { + + Layout layout; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params() { } + + 
CUTLASS_HOST_DEVICE + Params( + Conv3dProblemSize const &problem_size, + Layout const &layout + ): layout(layout) { + + } + }; + +private: + + Params const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + int filter_k_[ThreadMap::Iterations::kContiguous]; + + int offset_nzpq_[ThreadMap::Iterations::kStrided]; + +public: + + CUTLASS_HOST_DEVICE + Conv3dWgradOutputGradientTileAccessIteratorAnalytic( + Params const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)) { + + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + // initialize filter_k for every contiguous iteration + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + filter_k_[c] = threadblock_offset.row() + thread_coord.contiguous() + + c * ThreadMap::Delta::kContiguous; + } + + // initialize n, p, q offset for every strided iteration + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_nzpq_[s] = threadblock_offset.column() + thread_coord.strided() + + s * ThreadMap::Delta::kStrided; + + } + } + + CUTLASS_HOST_DEVICE + static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // 
moves to the next GEMM-K offset (offset_nzpq_) in GEMM-A by a CTA-K tile + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_nzpq_[s] += Shape::kColumn * problem_size_.split_k_slices; + } + } + + /// Returns the coordinate in the output gradient tensor Dy that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int nzpq = offset_nzpq_[iteration_strided_]; + + int n = nzpq / (problem_size_.Z * problem_size_.P * problem_size_.Q); + int residual = nzpq % (problem_size_.Z * problem_size_.P * problem_size_.Q); + + int z = residual / (problem_size_.P * problem_size_.Q); + residual = residual % (problem_size_.P * problem_size_.Q); + + int p = residual / problem_size_.Q; + int q = residual % problem_size_.Q; + + return TensorCoord(n, z, p, q, filter_k_[iteration_contiguous_]); + } + + + /// Returns true if the current coordinate is within the output gradient tensor Dy + CUTLASS_HOST_DEVICE + bool valid() const { + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && + coord.d() < problem_size_.Z && + coord.h() < problem_size_.P && + coord.w() < problem_size_.Q && + coord.c() < problem_size_.K; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + return reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dWgradOutputGradientTileAccessIteratorAnalytic &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given 
problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h new file mode 100644 index 0000000000000000000000000000000000000000..6c2f2e51e5e69f28d552839f906237b35d4879db --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h @@ -0,0 +1,310 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (output gradient tile) + matrix from memory. + + This iterator assumes TensorNDHWC layout of tensors in Global Memory. + + The iterator is specialized for each of the three convolution operators: forward propagation (Fprop), + backward data gradient (Dgrad), and backward weight gradient (Wgrad). 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv3d_problem_size.h" +#include "cutlass/conv/threadblock/conv3d_params.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename Shape_, + typename Element_, + typename ThreadMap_ +> +class Conv3dWgradOutputGradientTileAccessIteratorOptimized { +public: + + // + // Types + // + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorNDHWC; + using ThreadMap = ThreadMap_; + using AccessType = AlignedArray; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 3; + using ConvProblemSize = typename conv::Conv3dProblemSize; + static int const kAccessesPerVector = 1; + static_assert(sizeof_bits::value >= 8, + "WGRAD requires elements of size 8b or greater."); + + // + // Parameters structure + // + + struct Params : Conv3dWgradOutputGradientIteratorOptimizedParams { + // + // Methods + // + CUTLASS_HOST_DEVICE + Params() {} + + CUTLASS_HOST_DEVICE + Params(Conv3dWgradOutputGradientIteratorOptimizedParams const &base) + : Conv3dWgradOutputGradientIteratorOptimizedParams(base) {} + + CUTLASS_HOST_DEVICE + 
Params(Conv3dProblemSize const &problem_size, Layout const &layout) + : Conv3dWgradOutputGradientIteratorOptimizedParams( + problem_size, + layout, + sizeof_bits::value, + {Shape::kRow, Shape::kColumn}, + ThreadMap::kThreads, + ThreadMap::kElementsPerAccess, + {ThreadMap::Iterations::kContiguous, ThreadMap::Iterations::kStrided}, + {ThreadMap::Delta::kContiguous, ThreadMap::Delta::kStrided}) {} + }; + +private: + + Params const ¶ms_; + Conv3dProblemSize const &problem_size_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + char const *pointer_; + + uint32_t predicates_; + int filter_k_; + int offset_nzpq_; + +public: + + CUTLASS_HOST_DEVICE + Conv3dWgradOutputGradientTileAccessIteratorOptimized( + Params const ¶ms, + Conv3dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + predicates_(0), + filter_k_(0), + offset_nzpq_(0) { + + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.row() + thread_coord.contiguous(); + offset_nzpq_ = threadblock_offset.column() + thread_coord.strided(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + + int filter_k = filter_k_ + c * ThreadMap::Delta::kContiguous; + int offset_nzpq = offset_nzpq_ + s * ThreadMap::Delta::kStrided; + + bool predicate = valid_(at_(offset_nzpq, filter_k)); + + uint32_t pred = (predicate ? 
1u : 0); + + int pred_idx = c + s * ThreadMap::Iterations::kContiguous; + + predicates_ |= (pred << pred_idx); + } + } + + // Offset pointer to (iteration_strided_, iteration_contiguous_) = (0, 0) + pointer_ += ( + offset_nzpq_ * params.layout.stride()[0] + filter_k_ + ) * sizeof_bits::value / 8; + + set_iteration_index(0); + } + + CUTLASS_HOST_DEVICE + static Params getParams(Conv3dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next GEMM-K offset (offset_npq_) in GEMM-A by a CTA-K tile + offset_nzpq_ += Shape::kColumn * problem_size_.split_k_slices; + + // Clear predicates if needed + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + if (offset_nzpq_ + s * ThreadMap::Delta::kStrided >= params_.NZPQ) { + uint32_t kClearMask = ((1u << ThreadMap::Iterations::kContiguous) - 1) << (s * ThreadMap::Iterations::kContiguous); + predicates_ = (predicates_ & (~kClearMask)); + } + } + pointer_ += params_.inc_next_nzpq; + } + +private: + /// Returns the coordinate in the output gradient tensor Dy that is (offset_nzpq, k) pointed to + /// by the iterator. 
+ CUTLASS_HOST_DEVICE + TensorCoord at_(int offset_nzpq, int k) const { + + // The subseqnet fast_divmod() operations are equivalent to the following logical computation: + // + // + // int nzpq = offset_nzpq_; + // int n = nzpq / (problem_size_.Z * problem_size_.P * problem_size_.Q); + // int residual = nzpq % (problem_size_.Z * problem_size_.P * problem_size_.Q); + // + // int z = residual / (problem_size_.P * problem_size_.Q); + // residual = residual % (problem_size_.P * problem_size_.Q); + // + // int p = residual / problem_size_.Q; + // int q = residual % problem_size_.Q; + + int residual, n, z, p, q; + fast_divmod(n, residual, offset_nzpq, params_.ZPQ, params_.zpq_mul, params_.zpq_shr); + fast_divmod(z, residual, residual, params_.PQ, params_.pq_mul, params_.pq_shr); + fast_divmod(p, q, residual, problem_size_.Q, params_.q_mul, params_.q_shr); + + return TensorCoord(n, z, p, q, k); + } + + /// Returns true if the coord is within the output gradient tensor Dy + CUTLASS_HOST_DEVICE + bool valid_(TensorCoord coord) const { + + return coord.n() < problem_size_.N && + coord.c() < problem_size_.K; + } + +public: + + /// Returns true if the current coordinate is within the output gradient tensor Dy + CUTLASS_HOST_DEVICE + bool valid() const { + + LongIndex pred_idx = iteration_contiguous_ + iteration_strided_ * ThreadMap::Iterations::kContiguous; + return (predicates_ & (1u << pred_idx)); + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + + return reinterpret_cast( + pointer_ + + iteration_strided_ * params_.offset_next_strided + + iteration_contiguous_ * params_.offset_next_contiguous + ); + + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + Conv3dWgradOutputGradientTileAccessIteratorOptimized &operator++() { + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + 
++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines whether the Implicit GEMM can execute the given problem. + CUTLASS_HOST_DEVICE + static Status can_implement(Conv3dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } + +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + + diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_direct_conv_params.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_direct_conv_params.h new file mode 100644 index 0000000000000000000000000000000000000000..f5cd2a740232c257f8e3b25c37408973f536722b --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_direct_conv_params.h @@ -0,0 +1,230 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! + \file + \brief Extracts the host-params objects into non-template code. 
+*/ + +#pragma once + +#define TRACE_CONV_PARAMS_INITIALIZERS_ENABLED 0 + +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +#if TRACE_CONV_PARAMS_INITIALIZERS_ENABLED +#include +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters structure used for DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized +template +struct Depthwise2dFpropDirectConvParams; + +/// Parameters structure used for DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation +template +struct Depthwise2dFpropDirectConvActivationIteratorFixedStrideDilationParams; + +/// Parameters structure used for DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized +template +struct Depthwise2dFpropDirectConvFilterIteratorParams; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters structure used for DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized +template<> +struct Depthwise2dFpropDirectConvParams { + + using Layout = layout::TensorNHWC; + + Layout layout; + + int32_t activation_tile_h; + int32_t activation_tile_w; + int32_t activation_tile_hw; + FastDivmod activation_tile_w_divmod; + + int filter[2]; + int stride[2]; + int dilation[2]; + int inc_next[2]; + FastDivmod pq_divmod; + FastDivmod q_divmod; + + int activation_load_count; + int activation_storage_elements; + int activation_size; + // + // Methods + // + + CUTLASS_HOST_DEVICE + Depthwise2dFpropDirectConvParams() { } + + CUTLASS_HOST_DEVICE + Depthwise2dFpropDirectConvParams( + 
Conv2dProblemSize const &problem_size, + Layout const &layout, ///< layout object + MatrixCoord threadblock_shape, ///< CTA threadblock Shape + Layout::TensorCoord threadblock_output_shape, ///< Output tile Shape per threadblock + const int element_size_bits, ///< bits of activation element + const int thread_count, ///< threads per threadblock + const int thread_count_contiguous, ///< number of threads for continuous dimension + const int element_per_load) ///< element per each load + : layout(layout) { + + filter[0] = problem_size.S; + filter[1] = problem_size.R; + + stride[0] = problem_size.stride_w; + stride[1] = problem_size.stride_h; + + dilation[0] = problem_size.dilation_w; + dilation[1] = problem_size.dilation_h; + + // Compute activation_tile size per threadblock because stride and dilation are runtime params. + activation_tile_h = (threadblock_output_shape.h() - 1) * problem_size.stride_h + + (problem_size.R - 1) * problem_size.dilation_h + 1; + activation_tile_w = (threadblock_output_shape.w() - 1) * problem_size.stride_w + + (problem_size.S - 1) * problem_size.dilation_w + 1; + activation_tile_hw = activation_tile_h * activation_tile_w; + + activation_tile_w_divmod = FastDivmod(activation_tile_w); + + /// Below two values could not be templatized because the stride and dilation are runtime params + activation_load_count = (thread_count_contiguous * activation_tile_hw + (thread_count - 1)) / thread_count; + activation_storage_elements = activation_load_count * element_per_load * thread_count; + activation_size = activation_storage_elements * element_size_bits / 8; + + // Fastdivmod for output P, Q + int tiles_p = + (problem_size.P + (threadblock_output_shape.h() - 1)) / (threadblock_output_shape.h()); + int tiles_q = (problem_size.Q + (threadblock_output_shape.w() - 1)) / + (threadblock_output_shape.w()); + + pq_divmod = FastDivmod(tiles_p * tiles_q); + q_divmod = FastDivmod(tiles_q); + + // next S + inc_next[0] = problem_size.dilation_w; + // next R + 
inc_next[1] = (activation_tile_w * problem_size.dilation_h - (problem_size.S - 1) * problem_size.dilation_w); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Parameters structure used for DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation +template <> +struct Depthwise2dFpropDirectConvActivationIteratorFixedStrideDilationParams { + using Layout = layout::TensorNHWC; + + Layout layout; + + FastDivmod pq_divmod; + FastDivmod q_divmod; + + int activation_size; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Depthwise2dFpropDirectConvActivationIteratorFixedStrideDilationParams() {} + + CUTLASS_HOST_DEVICE + Depthwise2dFpropDirectConvActivationIteratorFixedStrideDilationParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, ///< Layout object + MatrixCoord threadblock_shape, ///< Threadblock Shape + Layout::TensorCoord threadblock_output_shape, ///< Output tile Shape per threadblock + const int activation_size_ ///< Activation size loaded by iterator + ) + : layout(layout), + activation_size(activation_size_) { + // Fastdivmod for output P, Q + int tiles_p = + (problem_size.P + (threadblock_output_shape.h() - 1)) / (threadblock_output_shape.h()); + int tiles_q = + (problem_size.Q + (threadblock_output_shape.w() - 1)) / (threadblock_output_shape.w()); + + pq_divmod = FastDivmod(tiles_p * tiles_q); + q_divmod = FastDivmod(tiles_q); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Parameters structure used for DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized +template <> +struct Depthwise2dFpropDirectConvFilterIteratorParams { + using Layout = layout::TensorNHWC; + + Layout layout; + + int filter_size; + + bool is_convolution; + // + // Methods + // + + CUTLASS_HOST_DEVICE + Depthwise2dFpropDirectConvFilterIteratorParams() {} + + CUTLASS_HOST_DEVICE + 
Depthwise2dFpropDirectConvFilterIteratorParams( + Conv2dProblemSize const &problem_size, + Layout const &layout, ///< Layout object + MatrixCoord threadblock_shape, ///< Threadblock Shape + const int filter_size_) ///< Filter size loaded by iterator + : layout(layout), + filter_size(filter_size_), + is_convolution(problem_size.mode == Mode::kConvolution){} +}; + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_fixed_stride_dilation.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_fixed_stride_dilation.h new file mode 100644 index 0000000000000000000000000000000000000000..012e306d800c3bcd62c1322a217d643d2ae38fd5 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_fixed_stride_dilation.h @@ -0,0 +1,314 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. 
+*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/threadblock/depthwise_direct_conv_params.h" +#include "cutlass/coord.h" +#include "cutlass/cutlass.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template > +class DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation { + public: + // + // Types + // + + using Shape = Shape_; + using OutputTileShape = OutputTileShape_; + using Element = Element_; + using Layout = Layout_; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + using TensorRef = cutlass::TensorRef; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + // Compilation value of stride , dialtion and activation shape + using StrideShape = StrideShape_; + using DilationShape = DilationShape_; + using ActivationShape = ActivationShape_; + + + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; + static int const kActivationSize = ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess * ThreadMap::kThreads * + sizeof_bits::value / 8; + + + 
static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, "Require Iterations::kContiguous == 1"); + + static_assert(OutputTileShape::kN == 1, "Require OutputTileShape::kN == 1"); + static_assert(OutputTileShape::kC == Shape::kColumn, "Require OutputTile shape == channels per threadblock"); + + // + // Parameters structure + // + + using Params = Depthwise2dFpropDirectConvActivationIteratorFixedStrideDilationParams; + + private: + Conv2dProblemSize const &problem_size_; + Params const ¶ms_; + char const *pointer_; + + // Base channels for current threadblock + int base_c_; + // Base activation index for current threadblock + int offset_intial_npq_; + // Base activation coord for current threadblock + TensorCoord activatioin_base_; + // Intial thread positioin + int offset_initial_hwc_; + // Overall load instruction per thread. + int iterator_load_; + // thread loading position. 
+ int iterator_hwc_; + // activation N is inside the Tensor or not + bool valid_n_; + + public: + + + CUTLASS_HOST_DEVICE + DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = + MatrixCoord() + ) + : params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + offset_intial_npq_(threadblock_offset.row()), + offset_initial_hwc_(thread_idx), + iterator_load_(0) { + + base_c_ = threadblock_offset.column(); + + set_iteration_index(0); + + set_activation_coord(offset_intial_npq_); + + } + + CUTLASS_HOST_DEVICE + void set_activation_coord(int offset_npq) { + int offset_inital_n, offset_inital_p, offset_inital_q; + int residual; + + params_.pq_divmod(offset_inital_n, residual, offset_npq); + params_.q_divmod(offset_inital_p, offset_inital_q, residual); + + int base_n = offset_inital_n; + + int base_h = + offset_inital_p * OutputTileShape::kH * StrideShape::kRow - problem_size_.pad_h; + + int base_w = + offset_inital_q * OutputTileShape::kW * StrideShape::kColumn - problem_size_.pad_w; + + activatioin_base_ = TensorCoord(base_n, base_h, base_w, base_c_); + + valid_n_ = activatioin_base_.n() < problem_size_.N; + } + + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params( + problem_size, + layout, + {Shape::kRow, Shape::kColumn}, + {OutputTileShape::kN, OutputTileShape::kH, OutputTileShape::kW, OutputTileShape::kC}, + kActivationSize); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iterator_hwc_ = offset_initial_hwc_ + index * ThreadMap::kThreads; + iterator_load_ = index; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value 
/ 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // Go to next threadblock + offset_intial_npq_ += problem_size_.split_k_slices; + + set_iteration_index(0); + + set_activation_coord(offset_intial_npq_); + } + + /// Returns the coordinate in the activations tensor X that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + int c = iterator_hwc_ % ThreadMap::Detail::ShapeVec::kContiguous ; + int next = iterator_hwc_ / ThreadMap::Detail::ShapeVec::kContiguous ; + int h = next / ActivationShape::kW; + int w = next % ActivationShape::kW; + + c = c * AccessType::kElements; + + return activatioin_base_ + TensorCoord(0, h, w, c); + } + + /// Returns true if the current coordinate is within the activations tensor X + CUTLASS_HOST_DEVICE + bool valid() const { + TensorCoord coord = at(); + bool valid_c = coord.c() < problem_size_.C; + bool valid_h = coord.h() >= 0 && coord.h() < problem_size_.H; + bool valid_w = coord.w() >= 0 && coord.w() < problem_size_.W; + return valid_n_ ? 
valid_c & valid_h & valid_w : 0; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + AccessType const *ptr = + reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + + return ptr; + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + DepthwiseFpropActivationDirect2dConvTileAccessIteratorFixedStrideDilation &operator++() { + + ++iterator_load_; + iterator_hwc_ += ThreadMap::kThreads; + + if (iterator_load_ < ThreadMap::Iterations::kCount) { + return *this; + } + + iterator_load_ = 0; + iterator_hwc_ = offset_initial_hwc_; + + return *this; + } + + /// Determines the activation size loaded by iterator + CUTLASS_HOST_DEVICE + int get_load_size() { + return kActivationSize; + } + + /// Determines the iterations needed + CUTLASS_HOST_DEVICE + int get_iteration_num() { + return ThreadMap::Iterations::kCount; + } + + /// Determines whether the Depthwise fprop can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check stride and dilation constraint + if (problem_size.stride_h != StrideShape::kRow || problem_size.stride_w != StrideShape::kColumn) { + return Status::kErrorInvalidProblem; + } + + if (problem_size.dilation_h != DilationShape::kRow || problem_size.dilation_w != DilationShape::kColumn) { + return Status::kErrorInvalidProblem; + } + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_optimized.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_optimized.h new file mode 100644 index 0000000000000000000000000000000000000000..b8ae9b9312c79f88715fd8cd1efebb2dad8a76f1 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_optimized.h @@ -0,0 +1,291 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM A (activation tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. 
+*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/threadblock/depthwise_direct_conv_params.h" +#include "cutlass/coord.h" +#include "cutlass/cutlass.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template > +class DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized { + public: + // + // Types + // + + using Shape = Shape_; + using OutputTileShape = OutputTileShape_; + using Element = Element_; + using Layout = Layout_; + using TensorCoord = typename Layout::TensorCoord; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + using TensorRef = cutlass::TensorRef; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, "Require Iterations::kContiguous == 1"); + + static_assert(OutputTileShape::kN == 1, "Require OutputTileShape::kN == 
1"); + static_assert(OutputTileShape::kC == Shape::kColumn, "Require OutputTile shape == channels per threadblock"); + + // + // Parameters structure + // + + using Params = Depthwise2dFpropDirectConvParams; + + private: + Conv2dProblemSize const &problem_size_; + Params const ¶ms_; + char const *pointer_; + + // Base channels for current threadblock + int base_c_; + // Base activation index for current threadblock + int offset_intial_npq_; + // Base activation coord for current threadblock + TensorCoord activatioin_base_; + // Intial thread positioin + int offset_initial_hwc_; + // Overall load instruction per thread. + int iterator_load_; + // thread loading position. + int iterator_hwc_; + // Number of loads for activations tensor X. + const int number_of_loads_; + + public: + + + CUTLASS_HOST_DEVICE + DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = + MatrixCoord() + ) + : params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + offset_intial_npq_(threadblock_offset.row()), + offset_initial_hwc_(thread_idx), + iterator_load_(0), + number_of_loads_(params.activation_load_count) { + + base_c_ = threadblock_offset.column(); + + set_activation_coord(offset_intial_npq_); + + set_iteration_index(0); + } + + CUTLASS_HOST_DEVICE + void set_activation_coord(int offset_npq) { + int offset_inital_n, offset_inital_p, offset_inital_q; + int residual; + + params_.pq_divmod(offset_inital_n, residual, offset_npq); + params_.q_divmod(offset_inital_p, offset_inital_q, residual); + + int base_n = offset_inital_n; + + int base_h = + offset_inital_p * OutputTileShape::kH * problem_size_.stride_h - problem_size_.pad_h; + + int base_w = + offset_inital_q * OutputTileShape::kW * problem_size_.stride_w - problem_size_.pad_w; + + activatioin_base_ = TensorCoord(base_n, base_h, base_w, base_c_); + } + + 
CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params( + problem_size, + layout, + {Shape::kRow, Shape::kColumn}, + {OutputTileShape::kN, OutputTileShape::kH, OutputTileShape::kW, OutputTileShape::kC}, + sizeof_bits::value, + ThreadMap::kThreads, + ThreadMap::Detail::ShapeVec::kContiguous, + ThreadMap::kElementsPerAccess); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iterator_hwc_ = offset_initial_hwc_ + index * ThreadMap::kThreads; + iterator_load_ = index; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_HOST_DEVICE + void advance() { + // Go to next threadblock + offset_intial_npq_ += problem_size_.split_k_slices; + + set_activation_coord(offset_intial_npq_); + } + + /// Returns the coordinate in the activations tensor X that is currently pointed to + /// by the iterator. 
+ CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int c = iterator_hwc_ % ThreadMap::Detail::ShapeVec::kContiguous ; + int next = iterator_hwc_ / ThreadMap::Detail::ShapeVec::kContiguous ; + int h, w; + params_.activation_tile_w_divmod(h, w, next) ; + + c = c * AccessType::kElements; + + return activatioin_base_ + TensorCoord(0, h, w, c); + } + + /// Returns true if the current coordinate is within the activations tensor X + CUTLASS_HOST_DEVICE + bool valid() const { + TensorCoord coord = at(); + + return coord.n() < problem_size_.N && coord.h() >= 0 && coord.h() < problem_size_.H && + coord.w() >= 0 && coord.w() < problem_size_.W && coord.c() < problem_size_.C; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + TensorCoord coord = at(); + LongIndex offset = params_.layout(coord); + + AccessType const *ptr = + reinterpret_cast(pointer_ + offset * sizeof_bits::value / 8); + + return ptr; + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + DepthwiseFpropActivationDirect2dConvTileAccessIteratorOptimized &operator++() { + + ++iterator_load_; + iterator_hwc_ += ThreadMap::kThreads; + + if (iterator_load_ < number_of_loads_) { + return *this; + } + + iterator_load_ = 0; + iterator_hwc_ = offset_initial_hwc_; + + return *this; + } + + /// Determines the activation size loaded by iterator + CUTLASS_HOST_DEVICE + int get_load_size() { + return params_.activation_size; + } + + /// Determines the iterations needed + CUTLASS_HOST_DEVICE + int get_iteration_num() { + return number_of_loads_; + } + + /// Determines whether the Depthwise fprop can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + // check alignment constraint on iterator's contiguous dimension + if (problem_size.C % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_direct_conv_multistage.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_direct_conv_multistage.h new file mode 100644 index 0000000000000000000000000000000000000000..846f1f3aeb269edc67b5e2c02db3f05993172025 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_direct_conv_multistage.h @@ -0,0 +1,551 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a multistage threadblock-scoped Implicit GEMM Convolution kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/cache_operation.h" +#include "cutlass/conv/threadblock/depthwise_mma_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. 
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Epilogue stores the data into global memory + typename Epilogue_, + /// iterator implementation variants + conv::IteratorAlgorithm IteratorAlgorithm_ = conv::IteratorAlgorithm::kOptimized, + /// Used for partial specialization + typename Enable = bool> +class DepthwiseFpropDirectConvMultipleStage : + public DepthwiseDirectConvMmaBase { +public: + ///< Base class + using Base = DepthwiseDirectConvMmaBase; + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA_; + ///< Iterates over tiles of B operand in global memory + using IteratorB = IteratorB_; + ///< Policy describing tuning details + using Policy = Policy_; + + using Epilogue = Epilogue_; + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = 
CacheOpB; + + static conv::IteratorAlgorithm const kItertorAlgorithm = IteratorAlgorithm_; + + // + // Dependent types + // + + /// Fragment of accumulator tile + + using ElementC = typename Policy::Operator::ElementC; + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Internal structure exposed for introspection. + struct Detail { + + /// Number of cp.async instructions to load one stage of operand A + static int const AsyncCopyIterationsPerStageA = + IteratorA::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of operand B + static int const AsyncCopyIterationsPerStageB = + IteratorB::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB = + (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + }; + + private: + + using WarpLoadedFragmentA = typename Operator::FragmentA; + using WarpLoadedFragmentB = typename Operator::FragmentB; + using WarpTransformedFragmentA = typename Operator::TransformedFragmentA; + using WarpTransformedFragmentB = typename Operator::TransformedFragmentB; + + private: + + // + // Data members + // + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + DepthwiseFpropDirectConvMultipleStage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage &shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx + ): + Base(shared_storage, thread_idx, warp_idx, 
lane_idx), + smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx), + smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) + { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance(IteratorA &iterator_A, + IteratorB &iterator_B, + int group_start_A = 0, + int group_start_B = 0) { + if (kItertorAlgorithm == conv::IteratorAlgorithm::kFixedStrideDilation) { + // Number of iterators is a static value. 
+ iterator_A.set_iteration_index(group_start_A * IteratorA::kAccessesPerVector); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast(this->smem_iterator_A_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + } + ++this->smem_iterator_A_; + } + } else { + // Number of iterators is a runtime value. + iterator_A.set_iteration_index(group_start_A * IteratorA::kAccessesPerVector); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < iterator_A.get_iteration_num(); ++j) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast(this->smem_iterator_A_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + } + ++this->smem_iterator_A_; + } + } + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations, + ///< destination accumulator tile + FragmentC &accum, + ///< iterator over A operand in global memory + IteratorA &iterator_A, + ///< Params of global memory iterator + typename IteratorA::Params const &iterator_a_params, + ///< iterator over B operand in global memory + IteratorB &iterator_B, + ///< Params of global memory iterator + typename 
IteratorB::Params const &iterator_b_params, + ///< initial value of accumulator + FragmentC const &src_accum, + /// Epilogue + Epilogue &epilogue, + ///< Output operator + typename Epilogue::OutputOp const &output_op, + ///< Tile iterator for destination + typename Epilogue::OutputTileIterator &destination_iterator, + ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles) + typename Epilogue::OutputTileIterator &source_iterator, + + int split_k_slices = 1 + ) { + + // + // Prologue + // + + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < Base::kStages - 1; ++stage, --gemm_k_iterations) { + + if (stage == 0) { + iterator_B.set_iteration_index(0); + this->smem_iterator_B_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast(this->smem_iterator_B_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + int const kSrcBytes = sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / + IteratorB::kAccessesPerVector / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_B.get(), iterator_B.valid()); + + ++iterator_B; + } + + ++this->smem_iterator_B_; + } + } + + if(kItertorAlgorithm == conv::IteratorAlgorithm::kFixedStrideDilation){ + // Number of iterators is compilation static. 
+ iterator_A.set_iteration_index(0); + this->smem_iterator_A_.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast(this->smem_iterator_A_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + + } else { + // Number of iterators is a runtime value. + iterator_A.set_iteration_index(0); + this->smem_iterator_A_.set_iteration_num(iterator_A.get_iteration_num()); + this->smem_iterator_A_.set_iteration_index(0); + + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < iterator_A.get_iteration_num(); ++j) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast(this->smem_iterator_A_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + } + + // Move to the next stage + iterator_A.advance(); + + this->smem_iterator_A_.add_tile_offset({1, 0}); + + // Inserts a fence to group cp.async instructions into stages. + cutlass::arch::cp_async_fence(); + } + + ///////////////////////////////////////////////////////////////////////////// + // Waits until kStages-2 stages have committed. 
+ cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpLoadedFragmentA warp_loaded_frag_A[2]; + WarpLoadedFragmentB warp_loaded_frag_B[2]; + WarpTransformedFragmentA warp_transformed_frag_A[2]; + WarpTransformedFragmentB warp_transformed_frag_B[2]; + + Operator warp_mma; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.setup_initial_status(iterator_a_params); + + + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0], + warp_loaded_frag_A[0], warp_loaded_frag_B[0]); + + // + // Mainloop + // + + unsigned int iterations = 0; + constexpr int inner_loop_iterations = round_up(Base::kWarpGemmIterations, 2); + + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > (-Base::kStages + 1);) { // Each iteration is a cta tile. + + accum.clear(); + + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < inner_loop_iterations; ++warp_mma_k) { + if (Base::kWarpGemmIterations % 2 == 0 || warp_mma_k + 1 != Base::kWarpGemmIterations) { + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. 
+ + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Shape::kK); + this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Shape::kK); + + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + } + + if (warp_mma_k > 0) + warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + warp_loaded_frag_A[warp_mma_k % 2], + warp_loaded_frag_B[warp_mma_k % 2]); + + // Issue global->shared copies for the next stage + int group_start_iteration_A, group_start_iteration_B; + + if (warp_mma_k == 0) { + group_start_iteration_A = 0; + group_start_iteration_B = 0; + copy_tiles_and_advance( + iterator_A, iterator_B, group_start_iteration_A, group_start_iteration_B); + } + + if (warp_mma_k < Base::kWarpGemmIterations) { + warp_mma( + accum, + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + accum + ); + } + + if (warp_mma_k + 1 == inner_loop_iterations) + warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2], + warp_transformed_frag_B[(warp_mma_k + 1) % 2], + warp_loaded_frag_A[(warp_mma_k + 1) % 2], + warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + + if (warp_mma_k + 2 == inner_loop_iterations) { + // Inserts a fence to group cp.async instructions into stages. 
+ cutlass::arch::cp_async_fence(); + + // Waits until kStages-2 stages of cp.async have committed + arch::cp_async_wait(); + __syncthreads(); + + // Move to the next cta + iterator_A.advance(); + + this->smem_iterator_A_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_A_.add_tile_offset({-Base::kStages, 0}); + + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_A_.advance(- (Base::kStages-1) * iterator_A.get_load_size()); + smem_read_stage_idx = 0; + } else { + this->warp_tile_iterator_A_.advance(iterator_A.get_load_size()); + ++smem_read_stage_idx; + } + + if (kItertorAlgorithm == conv::IteratorAlgorithm::kFixedStrideDilation) { + this->warp_tile_iterator_A_.setup_initial_status(iterator_a_params); + } + + // goback to start position. B has no multiple stage + this->warp_tile_iterator_B_.add_tile_offset({-Policy::kPartitionsK * Shape::kK, 0}); + + --gemm_k_iterations; + } + } + + // + // Epilogue + // + int32_t smem_base_offset = iterator_B.get_load_size() + (iterations % Base::kStages) * iterator_A.get_load_size(); + + destination_iterator.set_tile_index(iterations * split_k_slices); + + source_iterator.set_tile_index(iterations * split_k_slices); + + epilogue(output_op, destination_iterator, accum, source_iterator, smem_base_offset); + + ++iterations; + } + + // Insert fence and wait for all outstanding cp.async operations to commit. 
+ cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); + + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_filter_tile_access_iterator_direct_conv_optimized.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_filter_tile_access_iterator_direct_conv_optimized.h new file mode 100644 index 0000000000000000000000000000000000000000..1035fda375787cc211929854b662d1ccb7a809ae --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_filter_tile_access_iterator_direct_conv_optimized.h @@ -0,0 +1,261 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of convolution tiles mapped to GEMM B (filter tile) + matrix from memory. + + This iterator assumes TensorNHWC layout of tensors in Global Memory. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/threadblock/conv2d_params.h" +#include "cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +template > +class DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized { +public: + // + // Types + // + + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + using TensorRef = cutlass::TensorRef; + using TensorCoord = typename Layout::TensorCoord; + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + static IteratorAlgorithm const kIteratorAlgorithm = conv::IteratorAlgorithm::kOptimized; + static StrideSupport const kStrideSupport = conv::StrideSupport::kStrided; + static int const kConvDim = 2; + using ConvProblemSize = typename conv::Conv2dProblemSize; + + static int const kAccessesPerVector = ThreadMap::kElementsPerAccess / AccessType::kElements; + + static int const kFilterSize = ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess * ThreadMap::kThreads * + sizeof_bits::value / 8; + + static_assert(!(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + // + // Simplifying assertions + // + static_assert(ThreadMap::Iterations::kContiguous == 1, + "Require Iterations::kContiguous == 1"); + + // + // Parameters structure + // + 
using Params = Depthwise2dFpropDirectConvFilterIteratorParams; + + protected: + + Conv2dProblemSize const &problem_size_; + Params const ¶ms_; + LongIndex iteration_contiguous_; + LongIndex iteration_strided_; + LongIndex iteration_vector_; + char const *pointer_; + + int filter_k_; + int offset_trs_[ThreadMap::Iterations::kStrided]; + +public: + + + + CUTLASS_HOST_DEVICE + DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized( + Params const ¶ms, + Conv2dProblemSize const &problem_size, + Element const *ptr, + int thread_idx, + MatrixCoord const &threadblock_offset = MatrixCoord() + ): + params_(params), + problem_size_(problem_size), + pointer_(reinterpret_cast(ptr)), + filter_k_(0) { + + layout::PitchLinearCoord thread_coord = ThreadMap::initial_offset(thread_idx); + + filter_k_ = threadblock_offset.column() + thread_coord.contiguous(); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + offset_trs_[s] = threadblock_offset.row() + thread_coord.strided() + s * ThreadMap::Delta::kStrided; + } + + set_iteration_index(0); + } + + CUTLASS_HOST_DEVICE + static Params getParams(Conv2dProblemSize const &problem_size, Layout const &layout) { + return Params(problem_size, layout, {Shape::kRow, Shape::kColumn}, kFilterSize); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(Index index) { + iteration_vector_ = index % kAccessesPerVector; + int residual_access = index / kAccessesPerVector; + iteration_contiguous_ = residual_access % ThreadMap::Iterations::kContiguous; + iteration_strided_ = residual_access / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += pointer_offset * 8 / sizeof_bits::value; + } + + CUTLASS_HOST_DEVICE + void advance() { + // Do nothing because the filter is persistent in the SMEM + } + + /// Returns the coordinate in the filter 
tensor W that is currently pointed to + /// by the iterator. + CUTLASS_HOST_DEVICE + TensorCoord at() const { + + int k = filter_k_ + iteration_vector_ * AccessType::kElements; + int trs = offset_trs_[iteration_strided_]; + + return TensorCoord(k, trs, 0 , 0); // As a 2D-matrix + } + + /// Returns true if the current coordinate is within the activations tensor W + CUTLASS_HOST_DEVICE + bool valid() const { + + TensorCoord coord = at(); + + return coord.n() < problem_size_.K && + coord.h() < Shape::kColumn; + } + + /// Returns a pointer to the vector starting at the current coordinate + CUTLASS_HOST_DEVICE + AccessType const *get() const { + TensorCoord coord = at(); + int64_t offset = coord.n(); + if (params_.is_convolution) { + offset += (Shape::kColumn - coord.h() - 1)* problem_size_.K; + } else { + offset += coord.h() * problem_size_.K; + } + + return reinterpret_cast(pointer_ + + offset * sizeof_bits::value / 8); + } + + /// Increments to the next memory access + CUTLASS_HOST_DEVICE + DepthwiseFpropFilterDirectConvTileAccessIteratorOptimized &operator++() { + ++iteration_vector_; + if (iteration_vector_ < kAccessesPerVector) { + return *this; + } + iteration_vector_ = 0; + + ++iteration_contiguous_; + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) { + return *this; + } + iteration_contiguous_ = 0; + + ++iteration_strided_; + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + iteration_strided_ = 0; + + return *this; + } + + /// Determines the filter size loaded by iterator + CUTLASS_HOST_DEVICE + int get_load_size() { + return kFilterSize; + } + + /// Determines whether the Implicit GEMM can execute the given problem. 
+ CUTLASS_HOST_DEVICE + static Status can_implement(Conv2dProblemSize const &problem_size) { + + // check alignment constraint on iterator's contiguous dimension + if (problem_size.K % AccessType::kElements) { + return Status::kErrorInvalidProblem; + } + + // check whether runtime filter size is same as templated filter size. + if ((problem_size.R * problem_size.S) != Shape::kColumn) { + return Status::kErrorInvalidProblem; + } + + return Status::kSuccess; + } +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_pipelined.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_pipelined.h new file mode 100644 index 0000000000000000000000000000000000000000..30d13e9087e4b47384a43b9381036f668d581808 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_fprop_pipelined.h @@ -0,0 +1,336 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/aligned_buffer.h" +#include "cutlass/numeric_conversion.h" + +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/mma_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Transformation applied to A operand + typename TransformA_ = NumericArrayConverter< + typename SmemIteratorA_::Element, + typename IteratorA_::Element, + IteratorA_::Fragment::kElements>, + /// + /// Transformation applied to A operand + typename TransformB_ = NumericArrayConverter< + typename SmemIteratorB_::Element, + typename IteratorB_::Element, + IteratorB_::Fragment::kElements>, + /// Used for partial 
specialization + typename Enable = bool +> +class DepthwiseFpropPipelined : public gemm::threadblock::MmaBase { +public: + + ///< Base class + using Base = gemm::threadblock::MmaBase; + + using Shape = Shape_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using IteratorA = IteratorA_; ///< Iterates over tiles of A operand in global memory + using IteratorB = IteratorB_; ///< Iterates over tiles of B operand in global memory + using ElementC = ElementC_; ///< Data type of accumulator matrix + using LayoutC = LayoutC_; ///< Layout of accumulator matrix + using Policy = Policy_; ///< Policy describing tuning details + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + + using TransformA = TransformA_; + using TransformB = TransformB_; + + // + // Dependent types + // + + /// Fragment of operand A loaded from global memory + using FragmentA = typename IteratorA::Fragment; + + /// Fragment of operand B loaded from global memory + using FragmentB = typename IteratorB::Fragment; + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Obtain the arch tag from the warp-level operator + using ArchTag = typename Policy::Operator::ArchTag; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + // staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline) + static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2"); + +private: + + using WarpFragmentA = typename Operator::FragmentA; + using WarpFragmentB = typename Operator::FragmentB; + +protected: + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B 
operand to shared memory + SmemIteratorB smem_iterator_B_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + DepthwiseFpropPipelined( + typename Base::SharedStorage &shared_storage, ///< Shared storage needed for internal use by threadblock-scoped GEMM + int thread_idx, ///< ID within the threadblock + int warp_idx, ///< ID of warp + int lane_idx ///< ID of each thread within a warp + ): + Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx), + smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) { + + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + int gemm_k_iterations, ///< number of iterations of the mainloop + FragmentC &accum, ///< destination accumulator tile + IteratorA iterator_A, ///< iterator over A operand in global memory + IteratorB iterator_B, ///< iterator over B operand in global memory + FragmentC const &src_accum, ///< source accumulator tile + int gemm_k_iterations_per_channel = 0, ///< number of iterations per channel + TransformA transform_A = TransformA(), ///< 
transformation applied to A fragment + TransformB transform_B = TransformB()) { ///< transformation applied to B fragment + + // + // Prologue + // + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + FragmentA tb_frag_A; + FragmentB tb_frag_B; + + tb_frag_A.clear(); + tb_frag_B.clear(); + + // The last kblock is loaded in the prolog + iterator_A.load(tb_frag_A); + iterator_B.load(tb_frag_B); + + ++iterator_A; + ++iterator_B; + + this->smem_iterator_A_.store(transform_A(tb_frag_A)); + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B_; + + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math instructions + WarpFragmentA warp_frag_A[2]; + WarpFragmentB warp_frag_B[2]; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + Operator warp_mma; + + int smem_write_stage_idx = 1; + // Depthwise specific + int channel_start_index = 0; + int rs_plane_idx = 0; + + // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing + // shared memory loads (which have the tightest latency requirement). + + // + // Mainloop + // + + // Note: The main loop does not support Base::kWarpGemmIterations == 2. + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > 0; --gemm_k_iterations) { + // + // Loop over GEMM K dimension + // + + if(rs_plane_idx == gemm_k_iterations_per_channel - 1){ + // Reset interation index. + iterator_B.set_iteration_index(0); + } + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group + // as the case may be. 
+ + if (warp_mma_k == Base::kWarpGemmIterations - 1) { + + // Write fragments to shared memory + this->smem_iterator_A_.store(transform_A(tb_frag_A)); + + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + + __syncthreads(); + + if(rs_plane_idx == gemm_k_iterations_per_channel - 1){ + // Move to next set of filter groups. + channel_start_index += Base::kWarpGemmIterations; + } + + ++this->smem_iterator_A_; + ++this->smem_iterator_B_; + + // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory + if (smem_write_stage_idx == 1) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + } + else { + this->warp_tile_iterator_A_.add_tile_offset( + {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations, + 0}); + } + + smem_write_stage_idx ^= 1; + } + + this->warp_tile_iterator_A_.set_kgroup_index(channel_start_index + (warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.set_kgroup_index(channel_start_index + (warp_mma_k + 1) % Base::kWarpGemmIterations); + + this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k == 0) { + + iterator_A.load(tb_frag_A); + iterator_B.load(tb_frag_B); + + ++iterator_A; + ++iterator_B; + } + + warp_mma(accum, warp_frag_A[warp_mma_k % 2], + warp_frag_B[warp_mma_k % 2], accum); + } + + rs_plane_idx = (rs_plane_idx == gemm_k_iterations_per_channel - 1) ? 
0: (rs_plane_idx + 1); + + } + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_mma_base.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_mma_base.h new file mode 100644 index 0000000000000000000000000000000000000000..44dafcb5fa4f099e8070a9c8d271c4048128ceac --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_mma_base.h @@ -0,0 +1,229 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a directconv threadblock-scoped Depthwise kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Policy object describing MmaTensorOp +template < + /// Warp-level GEMM operator (concept: gemm::warp::Mma) + typename Operator_, + /// Padding used for A operand in shared memory (concept: MatrixShape) + typename SmemPaddingA_, + /// Padding used for B operand in shared memory (concept: MatrixShape) + typename SmemPaddingB_, + /// + typename ThreadMapA_, + /// + typename ThreadMapB_, + /// Number of partitions of K dimension of GEMM + int PartitionsK = 1> +struct DepthwiseDirectConvMmaPolicy { + /// Warp-level 
GEMM operator (concept: gemm::warp::MmaTensorOp or gemm::warp::MmaSimt) + using Operator = Operator_; + + /// Padding used for A operand in shared memory + using SmemPaddingA = SmemPaddingA_; + + /// Padding used for B operand in shared memory + using SmemPaddingB = SmemPaddingB_; + + using ThreadMapA = ThreadMapA_; + using ThreadMapB = ThreadMapB_; + + /// Number of partitions of K dimension + static int const kPartitionsK = PartitionsK; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Used for partial specialization + typename Enable = bool> +class DepthwiseDirectConvMmaBase { + public: + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + + ///< Policy describing tuning details + using Policy = Policy_; + + // + // Dependent types + // + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Shape describing the overall GEMM computed from shared memory + /// by each warp. + using WarpGemm = typename Policy::Operator::Shape; + + /// Shape describing the number of warps filling the CTA + using WarpCount = cutlass::gemm:: + GemmShape; + + /// Number of warp-level GEMM oeprations + /// kWarpGemmIterations could be even and odd. 
+ static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK); + + /// Number of stages + static int const kStages = Stages; + + /// Tensor reference to the A operand + using TensorRefA = TensorRef; + + /// Tensor reference to the B operand + using TensorRefB = TensorRef; + + static_assert(kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + // + // Nested structs + // + + /// Shared storage object needed by threadblock-scoped GEMM + class SharedStorage { + public: + // + // Type definitions + // + + /// Shape of the A matrix operand in shared memory + using ShapeA = MatrixShape<1, // Not determined at compile-time :( + Shape::kN + Policy::SmemPaddingA::kRow>; + + /// Shape of the B matrix operand in shared memory + using ShapeB = MatrixShape; // Tile N = 64? + + public: + // + // Data members + // + + // Let persistent B matrix in front of dynamic matrix A + /// Buffer for B operand + AlignedBuffer operand_B; + + /// Buffer for A operand + /// Not be determined at compile-time -- Just to get a Smem start address. 
+ AlignedBuffer operand_A; + public: + // + // Methods + // + + /// Returns a layout object for the A matrix + CUTLASS_DEVICE + static typename Operator::LayoutA LayoutA() { + return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn}); + } + + /// Returns a layout object for the B matrix + CUTLASS_HOST_DEVICE + static typename Operator::LayoutB LayoutB() { + return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn}); + } + + /// Returns a TensorRef to the A operand + CUTLASS_HOST_DEVICE + TensorRefA operand_A_ref() { return TensorRefA{operand_A.data(), LayoutA()}; } + + /// Returns a TensorRef to the B operand + CUTLASS_HOST_DEVICE + TensorRefB operand_B_ref() { return TensorRefB{operand_B.data(), LayoutB()}; } + }; + + protected: + // + // Data members + // + + /// Iterator to load a warp-scoped tile of A operand from shared memory + typename Operator::IteratorA warp_tile_iterator_A_; + + /// Iterator to load a warp-scoped tile of B operand from shared memory + typename Operator::IteratorB warp_tile_iterator_B_; + + public: + /// Construct from tensor references + CUTLASS_DEVICE + DepthwiseDirectConvMmaBase( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + SharedStorage &shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx), + warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx) {} +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_mma_core_with_lane_access_size.h 
b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_mma_core_with_lane_access_size.h new file mode 100644 index 0000000000000000000000000000000000000000..9e3cc417d4cc3169724f9e5db9e82fa093121fae --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/depthwise_mma_core_with_lane_access_size.h @@ -0,0 +1,952 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines basic properties needed by CTA-level GEMMs assuming expectations about data + layout of the global memory fragments, data types, and internal tile sizes. + + Partial specializations for threadblock::Mma operations targeting depthwise related simt instructions. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" + +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/gemm/warp/mma.h" + +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/warp/mma_depthwise_simt.h" + +#include "cutlass/gemm/threadblock/mma_pipelined.h" +#include "cutlass/gemm/threadblock/mma_singlestage.h" + +#include "cutlass/gemm/threadblock/mma_base.h" +#include "cutlass/conv/threadblock/depthwise_mma_base.h" + +#include "cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear_direct_conv.h" + +#include "cutlass/arch/cache_operation.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +namespace detail { +// +// Convert a WarpShapeM which is the whole tile of elements into the number of elements (2D) held by +// each partitions within warp. 
+// The goal is for each thread's tile of elements to be as square as +// possible for performance (4x4 will be faster than 2x8). +template // The number of partitions within the warp +struct SimtWarpShape { + // kP * kQ * WarpNumThreadsM = WarpShapeM + // If needed, enable more specializations. +}; +template <> +struct SimtWarpShape<4, 4> { + static constexpr int kP = 1; + static constexpr int kQ = 1; +}; + +template <> +struct SimtWarpShape<4, 2> { + static constexpr int kP = 2; + static constexpr int kQ = 1; +}; + +template <> +struct SimtWarpShape<4, 1> { + static constexpr int kP = 2; + static constexpr int kQ = 2; +}; + +template <> +struct SimtWarpShape<8, 1> { + static constexpr int kP = 2; + static constexpr int kQ = 4; +}; +template <> +struct SimtWarpShape<8, 2> { + static constexpr int kP = 2; + static constexpr int kQ = 2; +}; +template <> +struct SimtWarpShape<8, 4> { + static constexpr int kP = 1; + static constexpr int kQ = 2; +}; + +template <> +struct SimtWarpShape<16, 1> { + static constexpr int kP = 4; + static constexpr int kQ = 4; +}; +template <> +struct SimtWarpShape<16, 2> { + static constexpr int kP = 2; + static constexpr int kQ = 4; +}; +template <> +struct SimtWarpShape<16, 4> { + static constexpr int kP = 2; + static constexpr int kQ = 2; +}; + +template +struct SimtWarpShape<25, WarpNumThreadsM> { + static_assert(WarpNumThreadsM == 1, "WarpShapeM could not be evenly splited by threads"); + static constexpr int kP = 5; + static constexpr int kQ = 5; +}; + +template <> +struct SimtWarpShape<32, 1> { + static constexpr int kP = 4; + static constexpr int kQ = 8; +}; + +template <> +struct SimtWarpShape<32, 2> { + static constexpr int kP = 4; + static constexpr int kQ = 4; +}; + +template <> +struct SimtWarpShape<32, 4> { + static constexpr int kP = 2; + static constexpr int kQ = 4; +}; + +} // namespace detail + +template < + /// Shape of threadblock-scoped matrix multiply operator + typename Shape, + /// Shape of warp-level matrix 
multiply operator + typename WarpShape, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape, + /// Element data type of A operand + typename ElementA, + /// Layout of operand A + typename LayoutA, + /// Element data type of B operand + typename ElementB, + /// Layout of operand B + typename LayoutB, + /// Data type of accumulator + typename ElementC, + /// Layout of accumulator + typename LayoutC, + /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp) + typename OperatorClass, + /// Size of a warp-scoped per thread access + int kLaneAccessSizeA_ = 0, + /// Size of a warp-scoped per thread access + int kLaneAccessSizeB_ = 0, + /// Number of stages + int Stages = 2, + /// Operation performed by MMA + typename Operator = typename platform::conditional< + (platform::is_same::value) && + (platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value), + cutlass::arch::OpMultiplyAddSaturate, + cutlass::arch::OpMultiplyAdd>::type, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. 
+ bool AccumulatorsInRowMajor = false, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA = + cutlass::arch::CacheOperation::Global, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB = + cutlass::arch::CacheOperation::Global, + /// per-element transformation for elements of A + ComplexTransform TransformA = ComplexTransform::kNone, + /// per-element transformation for elements of B + ComplexTransform TransformB = ComplexTransform::kNone, + bool IsComplex = false // (is_complex::value || is_complex::value) +> +struct DepthwiseMmaCoreWithLaneAccessSize; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Shape of threadblock-scoped matrix multiply operator + typename Shape, + /// Shape of threadblock-scoped output tile + typename ThreadBlockOutputShape, + /// Shape of filter shape per threadblock + typename FilterShape, + /// Shape of warp-level matrix multiply operator + typename WarpShape, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape, + /// Element data type of A operand + typename ElementA, + /// Layout of operand A + typename LayoutA, + /// Element data type of B operand + typename ElementB, + /// Layout of operand B + typename LayoutB, + /// Data type of accumulator + typename ElementC, + /// Layout of accumulator + typename LayoutC, + /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp) + typename OperatorClass, + /// Size of a warp-scoped per thread access + int kLaneAccessSizeA_ = 0, + /// Size of a warp-scoped per thread access + int kLaneAccessSizeB_ = 0, + /// Number of stages + int Stages = 2, + /// Operation performed by MMA + typename Operator = typename platform::conditional< + (platform::is_same::value) && + (platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value), + 
cutlass::arch::OpMultiplyAddSaturate, + cutlass::arch::OpMultiplyAdd>::type, + /// Iterator algo type + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic, + /// Stride ( MatrixShape ) + typename StrideShape = cutlass::MatrixShape<-1, -1>, + /// Dilation ( MatrixShape ) + typename DilationShape = cutlass::MatrixShape<-1, -1>, + /// Activation Shape loaded by threadblock + typename ActivationShape = cutlass::conv::TensorNHWCShape<-1,-1,-1,-1>, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. + bool AccumulatorsInRowMajor = false, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA = + cutlass::arch::CacheOperation::Global, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB = + cutlass::arch::CacheOperation::Global, + /// per-element transformation for elements of A + ComplexTransform TransformA = ComplexTransform::kNone, + /// per-element transformation for elements of B + ComplexTransform TransformB = ComplexTransform::kNone, + bool IsComplex = false // (is_complex::value || is_complex::value) +> +struct DepthwiseDirectConvMmaCoreWithLaneAccessSize; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Shape of threadblock-scoped matrix multiply operator + typename Shape, + /// Shape of warp-level matrix multiply operator + typename WarpShape, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape, + /// Element data type of A operand + typename ElementA, + /// Layout of operand A + typename LayoutA, + /// Element data type of B operand + typename ElementB, + /// Layout of operand B + typename LayoutB, + /// Data type of accumulator + typename ElementC, + /// Layout of accumulator + typename LayoutC, + /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp) + typename OperatorClass, + 
/// Number of stages + int Stages, + /// Operation performed by MMA + typename Operator, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. + bool AccumulatorsInRowMajor, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// per-element transformation for elements of A + ComplexTransform TransformA, + /// per-element transformation for elements of B + ComplexTransform TransformB, + bool IsComplex +> +struct DepthwiseMmaCoreWithLaneAccessSize< + Shape, WarpShape, InstructionShape, + ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, + OperatorClass, -1, -1, Stages, Operator, AccumulatorsInRowMajor, + CacheOpA, CacheOpB, TransformA, TransformB, IsComplex +> : cutlass::gemm::threadblock::DefaultMmaCore< + Shape, WarpShape, InstructionShape, + ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, + OperatorClass, Stages, Operator, AccumulatorsInRowMajor, + CacheOpA, CacheOpB, TransformA, TransformB, IsComplex +> {}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization: +/// +/// A: row-major +/// B: column-major +/// Operator: simt class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Size of a warp-scoped per thread access (a value of -1 indicates the default) + int kLaneAccessSizeA_, + /// Size of a warp-scoped per thread access (a value of -1 indicates the default) 
+ int kLaneAccessSizeB_, + /// Operation performed by GEMM + typename Operator_> +struct DepthwiseMmaCoreWithLaneAccessSize, + ElementA_, + layout::RowMajor, + ElementB_, + layout::ColumnMajor, + ElementC_, + LayoutC_, + arch::OpClassSimt, + kLaneAccessSizeA_, + kLaneAccessSizeB_, + 2, + Operator_> : public cutlass::gemm::threadblock::DefaultMmaCore, + ElementA_, + layout::RowMajor, + ElementB_, + layout::ColumnMajor, + ElementC_, + LayoutC_, + arch::OpClassSimt, + 2, + Operator_> { + using Base = cutlass::gemm::threadblock::DefaultMmaCore, + ElementA_, + layout::RowMajor, + ElementB_, + layout::ColumnMajor, + ElementC_, + LayoutC_, + arch::OpClassSimt, + 2, + Operator_>; + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + using ElementA = ElementA_; + using LayoutA = layout::RowMajor; + using ElementB = ElementB_; + using LayoutB = layout::ColumnMajor; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + using OperatorClass = arch::OpClassSimt; + + static int const kLaneAccessSizeA = kLaneAccessSizeA_; + static int const kLaneAccessSizeB = kLaneAccessSizeB_; + + // Divisility requirements + static_assert( kLaneAccessSizeA > 0 && kLaneAccessSizeB > 0, + "Size of a warp-scoped per thread access should be larger then ZERO" ); + + /// Default Operator + using Operator = Operator_; + + /// Number of warps present + using WarpCount = typename Base::WarpCount; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && + !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size." 
+ ); + + /// Number of threads per warp + static int const kWarpSize = cutlass::gemm::warp::WarpSize::value; + + static int const kElementsPerAccess = 1; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajor; + using SmemLayoutB = layout::RowMajor; + + // + // Iterators to write to shared memory are same as base class + // + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level op + static const int WarpNumThreadsM = cutlass::gemm::threadblock::detail::simt_get_warp_threads_m(); + static const int WarpNumThreadsN = kWarpSize / WarpNumThreadsM; + static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM; + static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN; + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1; + static const int numElementsA = kLaneAccessSizeA / sizeof_bits::value; + static const int numElementsB = kLaneAccessSizeB / sizeof_bits::value; + static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM); + static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN); + + static int const kPaddingM = cutlass::gemm::threadblock::detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits::value); + static int const kPaddingN = cutlass::gemm::threadblock::detail::simt_transpose_padding(kWarpSize, Shape::kK, sizeof_bits::value); + + static_assert(!(kPaddingM % LaneM) && !(kPaddingN % LaneN), + "Padding must be divisible by Lane"); + + // these should have max of thread tile also + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cutlass::conv::warp::MmaDepthwiseSimt< + WarpShape, /// Size of 
the Gemm problem - concept: gemm::GemmShape<>
      ElementA,     /// Data type of A elements
      SmemLayoutA,  /// Layout of A matrix (concept: MatrixLayout)
      ElementB,     /// Data type of B elements
      SmemLayoutB,  /// Layout of B matrix (concept: MatrixLayout)
      ElementC,     /// Element type of C matrix
      LayoutC,      /// Layout of C matrix (concept: MatrixLayout)
      Policy        /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
  >;

  /// Policy used to define MmaPipelined
  using MmaPolicy = cutlass::gemm::threadblock::MmaPolicy<
      MmaWarpSimt,
      MatrixShape,               // skew for A matrix to avoid SMEM bank conflicts
      MatrixShape<0, kPaddingN>, // skew for B matrix to avoid SMEM bank conflicts
      WarpCount::kK
  >;
};

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Partial specialization:
///
///   A: row-major
///   B: row-major
///   Operator: simt class
///
/// This uses the default warp-level operator given tile sizes
///
/// Threadblock-scoped depthwise direct-conv MMA core: derives the shared-memory
/// layouts, global/shared-memory thread maps, and the warp-level SIMT policy
/// from the threadblock and warp tile shapes.
//
// NOTE(review): several template-argument lists in this vendored header appear
// to have been stripped during extraction (e.g. the specialization arguments of
// "DepthwiseDirectConvMmaCoreWithLaneAccessSize," and bare "MatrixShape," /
// "sizeof_bits::value" below carry no <...> arguments), so this file cannot
// compile as-is — restore the argument lists from the upstream CUTLASS header
// before use.
template <
    /// Shape of threadblock-scoped matrix multiply operator (concept:
    /// GemmShape)
    typename Shape_,
    /// Shape of threadblock-scoped output tile (concept: TensorNHWCShape)
    typename ThreadBlockOutputShape_,
    /// Shape of filter shape per threadblock
    typename FilterShape_,
    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
    typename WarpShape_,
    /// Data type of A operand
    typename ElementA_,
    /// Data type of B operand
    typename ElementB_,
    /// Data type of accumulator
    typename ElementC_,
    /// Layout of accumulator
    typename LayoutC_,
    /// Size of a warp-scoped per thread access
    int kLaneAccessSizeA_,
    /// Number of stages
    int Stages_,
    /// Operation performed by GEMM
    typename Operator_>
struct DepthwiseDirectConvMmaCoreWithLaneAccessSize,
    ElementA_,
    layout::RowMajor,
    ElementB_,
    layout::ColumnMajor,
    ElementC_,
    LayoutC_,
    arch::OpClassSimt,
    kLaneAccessSizeA_,
    128,
    Stages_,
    Operator_> {
  using Shape = Shape_;
  using FilterShape = FilterShape_;
  using WarpShape = WarpShape_;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
  using ElementA = ElementA_;
  using LayoutA = layout::RowMajor;
  using ElementB = ElementB_;
  using LayoutB = layout::ColumnMajor;
  using ElementC = ElementC_;
  using LayoutC = LayoutC_;
  using OperatorClass = arch::OpClassSimt;

  // B operand is always fetched with a 128-bit per-thread access in this
  // specialization.
  static int const kLaneAccessSizeB = 128;

  // Divisibility requirements
  static_assert( kLaneAccessSizeB > 0,
    "Size of a warp-scoped per thread access should be larger then ZERO" );

  /// Default Operator
  using Operator = Operator_;

  /// Number of warps present
  using WarpCount = cutlass::gemm::GemmShape<
      Shape::kM / WarpShape::kM,
      Shape::kN / WarpShape::kN,
      1
  >;

  // Divisibility requirements
  static_assert(
    !(Shape::kM % WarpShape::kM) &&
    !(Shape::kN % WarpShape::kN),
    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
  );

  /// Number of threads per warp
  static int const kWarpSize = cutlass::gemm::warp::WarpSize::value;

  /// Number of threads total
  static int const kThreads = WarpCount::kCount * kWarpSize;

  // For Gmem load: elements per 128-bit global-memory access for each operand.
  static int const kElementsPerAccessA = 128 / sizeof_bits::value;
  static int const kElementsPerAccessB = 128 / sizeof_bits::value;

  //
  // Shared memory layouts
  //

  using SmemLayoutA = layout::RowMajor;
  using SmemLayoutB = layout::RowMajor;

  //
  // Iterators to write to shared memory
  //

  /// ThreadMap of iterator A
  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
      layout::PitchLinearShape, // Set kStrided = 1 because activation shape is runtime value.
      kThreads,
      kElementsPerAccessA
  >;

  /// ThreadMap of iterator A
  using SmemThreadMapA = IteratorThreadMapA;

  /// Shared memory iterator to A operand
  using SmemIteratorA = transform::threadblock::RegularTileAccessIteratorDirectConv<
      MatrixShape<1, Shape::kN>, // set kRow is 1 because it is a runtime value
      ElementA,
      SmemLayoutA,
      0,
      SmemThreadMapA, // was IteratorThreadMapA
      true // Dynamic iterations.
  >;

  /// ThreadMap of iterator B
  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
      layout::PitchLinearShape,
      kThreads,
      kElementsPerAccessB
  >;

  /// Transpose the ThreadMap of iterator B
  using SmemThreadMapB = IteratorThreadMapB;

  /// Shared memory iterator to B operand
  using SmemIteratorB = transform::threadblock::RegularTileAccessIteratorDirectConv<
      MatrixShape,
      ElementB,
      SmemLayoutB,
      0,
      SmemThreadMapB, // was IteratorThreadMapB
      false // static iterations.
  >;

  //
  // Warp-level matrix multiply operator
  //
  // Groups per threads
  //  Fp32: 2 groups
  //  Fp16: 2 groups
  // (>1-byte element types get 2 groups, 1-byte types get 4.)
  static const int GroupsPerThread = sizeof(ElementB) > 1 ? 2 : 4;
  // Define the warp-level op
  static const int WarpNumThreadsN = cutlass::const_min(WarpShape::kN / GroupsPerThread, kWarpSize);
  static const int WarpNumThreadsM = kWarpSize / WarpNumThreadsN;

  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
      "WarpShape must be divisible by ThreadTile shape.");

  // Get output P, Q per thread
  static const int TileP = cutlass::conv::threadblock::detail::SimtWarpShape::kP;
  static const int TileQ = cutlass::conv::threadblock::detail::SimtWarpShape::kQ;

  static const int LaneLayout = 1;
  static const int numElementsB = kLaneAccessSizeB / sizeof_bits::value;
  static const int LaneN = cutlass::const_min(numElementsB, WarpShape::kN / WarpNumThreadsN);

  // Define the output tile computed by each thread
  using ThreadOutputShape = cutlass::conv::TensorNHWCShape<1, TileP, TileQ, LaneN>;

  // Fetch the channel with same access size
  static const int LaneM = LaneN;

  // No paddings
  static int const kPaddingM = 0;
  static int const kPaddingN = 0;

  static_assert(!(kPaddingM % LaneM) && !(kPaddingN % LaneN),
      "Padding must be divisible by Lane");

  // these should have max of thread tile also
  using LaneMmaShape = cutlass::gemm::GemmShape<
      LaneM,
      LaneN,
      1>;

  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
      cutlass::MatrixShape,                 // WarpShape
      cutlass::layout::RowMajorInterleaved, // LaneLayout
      LaneMmaShape
  >;

  using MmaWarpSimt = cutlass::conv::warp::MmaDepthwiseDirectConvSimt<
      WarpShape,               /// Size of the Gemm problem - concept: gemm::GemmShape<>
      FilterShape,             /// Shape of filter shape per threadblock - concept: gemm::GemmShape
      ThreadOutputShape,       /// Size of the output tile computed by thread - concept: conv::TensorNHWCShape<>
      ThreadBlockOutputShape_, /// Size of the output tile computed by threadblock - concept: conv::TensorNHWCShape<>
      ElementA,                /// Data type of A elements
      SmemLayoutA,             /// Layout of A matrix (concept: MatrixLayout)
      ElementB,                /// Data type of B elements
      SmemLayoutB,             /// Layout of B matrix (concept: MatrixLayout)
      ElementC,                /// Element type of C matrix
      LayoutC,                 /// Layout of C matrix (concept: MatrixLayout)
      Policy                   /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
  >;

  /// Policy used to define MmaPipelined
  using MmaPolicy = cutlass::conv::threadblock::DepthwiseDirectConvMmaPolicy<
      MmaWarpSimt,
      MatrixShape,               // skew for A matrix to avoid SMEM bank conflicts
      MatrixShape<0, kPaddingN>, // skew for B matrix to avoid SMEM bank conflicts
      IteratorThreadMapA,
      IteratorThreadMapB,
      WarpCount::kK
  >;
};

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Partial specialization:
///
///   A: row-major
///   B: row-major
///   Operator: simt class
///
/// This uses the default warp-level operator given tile sizes
template <
    /// Shape of threadblock-scoped matrix multiply operator (concept:
    /// GemmShape)
    typename Shape_,
    /// Shape of threadblock-scoped output tile (concept: TensorNHWCShape)
    typename ThreadBlockOutputShape_,
    /// Shape of filter shape per threadblock
    typename FilterShape_,
    /// Shape of warp-level matrix multiply operator (concept: GemmShape)
    typename WarpShape_,
    /// Data type of A operand
    typename ElementA_,
    /// Data type of B operand
    typename ElementB_,
    /// Data type of accumulator
    typename ElementC_,
    /// Layout of accumulator
    typename LayoutC_,
    /// Size of a warp-scoped per thread access
    int kLaneAccessSizeA_,
    /// Number of stages
    int Stages_,
    /// Operation performed by GEMM
    typename Operator_,
    /// Stride ( MatrixShape )
    typename StrideShape_,
    /// Dilation ( MatrixShape )
    typename DilationShape_,
    /// Activation Shape loaded by threadblock
    typename ActivationShape_>
struct DepthwiseDirectConvMmaCoreWithLaneAccessSize,
    ElementA_,
    layout::RowMajor,
    ElementB_,
    layout::ColumnMajor,
    ElementC_,
    LayoutC_,
arch::OpClassSimt,
    kLaneAccessSizeA_,
    128,
    Stages_,
    Operator_,
    IteratorAlgorithm::kFixedStrideDilation,
    StrideShape_,
    DilationShape_,
    ActivationShape_> {
  // Specialization for the kFixedStrideDilation iterator algorithm: stride,
  // dilation, and the threadblock-loaded activation shape are compile-time
  // template parameters, so both shared-memory iterators use static iteration
  // counts (contrast with the dynamic-activation-shape specialization above).
  //
  // NOTE(review): as elsewhere in this vendored header, several <...>
  // template-argument lists appear stripped (e.g. bare "MatrixShape," and
  // "sizeof_bits::value"); restore from the upstream CUTLASS header before use.
  using Shape = Shape_;
  using FilterShape = FilterShape_;
  using WarpShape = WarpShape_;
  using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
  using ElementA = ElementA_;
  using LayoutA = layout::RowMajor;
  using ElementB = ElementB_;
  using LayoutB = layout::ColumnMajor;
  using ElementC = ElementC_;
  using LayoutC = LayoutC_;
  using OperatorClass = arch::OpClassSimt;
  using StrideShape = StrideShape_;
  using DilationShape = DilationShape_;
  using ThreadBlockOutputShape = ThreadBlockOutputShape_;
  using ActivationShape = ActivationShape_;

  static int const kLaneAccessSizeB = 128;

  // Divisibility requirements
  static_assert( kLaneAccessSizeB > 0,
    "Size of a warp-scoped per thread access should be larger then ZERO" );

  /// Default Operator
  using Operator = Operator_;

  /// Number of warps present
  using WarpCount = cutlass::gemm::GemmShape<
      Shape::kM / WarpShape::kM,
      Shape::kN / WarpShape::kN,
      1
  >;

  // Divisibility requirements
  static_assert(
    !(Shape::kM % WarpShape::kM) &&
    !(Shape::kN % WarpShape::kN),
    "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."
  );

  /// Number of threads per warp
  static int const kWarpSize = cutlass::gemm::warp::WarpSize::value;

  /// Number of threads total
  static int const kThreads = WarpCount::kCount * kWarpSize;

  // For Gmem load: elements per 128-bit global-memory access for each operand.
  static int const kElementsPerAccessA = 128 / sizeof_bits::value;
  static int const kElementsPerAccessB = 128 / sizeof_bits::value;

  //
  // Shared memory layouts
  //

  using SmemLayoutA = layout::RowMajor;
  using SmemLayoutB = layout::RowMajor;

  //
  // Iterators to write to shared memory
  //

  /// ThreadMap of iterator A
  using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap<
      layout::PitchLinearShape,
      kThreads,
      kElementsPerAccessA
  >;

  /// ThreadMap of iterator A
  using SmemThreadMapA = IteratorThreadMapA;

  /// Shared memory iterator to A operand
  using SmemIteratorA = transform::threadblock::RegularTileAccessIteratorDirectConv<
      MatrixShape,
      ElementA,
      SmemLayoutA,
      0,
      SmemThreadMapA, // was IteratorThreadMapA
      false // static iterations.
  >;

  /// ThreadMap of iterator B
  using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap<
      layout::PitchLinearShape,
      kThreads,
      kElementsPerAccessB
  >;

  /// Transpose the ThreadMap of iterator B
  using SmemThreadMapB = IteratorThreadMapB;

  /// Shared memory iterator to B operand
  using SmemIteratorB = transform::threadblock::RegularTileAccessIteratorDirectConv<
      MatrixShape,
      ElementB,
      SmemLayoutB,
      0,
      SmemThreadMapB, // was IteratorThreadMapB
      false // static iterations.
  >;

  //
  // Warp-level matrix multiply operator
  //
  // Groups per threads
  //  Fp32: 2 groups
  //  Fp16: 2 groups
  // (>1-byte element types get 2 groups, 1-byte types get 4.)
  static const int GroupsPerThread = sizeof(ElementB) > 1 ? 2 : 4;
  // Define the warp-level op
  static const int WarpNumThreadsN = cutlass::const_min(WarpShape::kN / GroupsPerThread, kWarpSize);
  static const int WarpNumThreadsM = kWarpSize / WarpNumThreadsN;

  static const int TileP = cutlass::conv::threadblock::detail::SimtWarpShape::kP;
  static const int TileQ = cutlass::conv::threadblock::detail::SimtWarpShape::kQ;

  static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN),
      "WarpShape must be divisible by ThreadTile shape.");

  static const int LaneLayout = 1;
  static const int numElementsB = kLaneAccessSizeB / sizeof_bits::value;
  static const int LaneN = cutlass::const_min(numElementsB, WarpShape::kN / WarpNumThreadsN);

  // Define the output tile computed by each thread
  using ThreadOutputShape = cutlass::conv::TensorNHWCShape<1, TileP, TileQ, LaneN>;

  // Fetch the channel with same access size
  static const int LaneM = LaneN;

  // No paddings
  static int const kPaddingM = 0;
  static int const kPaddingN = 0;

  static_assert(!(kPaddingM % LaneM) && !(kPaddingN % LaneN),
      "Padding must be divisible by Lane");

  // these should have max of thread tile also
  using LaneMmaShape = cutlass::gemm::GemmShape<
      LaneM,
      LaneN,
      1>;

  using Policy = cutlass::gemm::warp::MmaSimtPolicy<
      cutlass::MatrixShape,                 // WarpShape
      cutlass::layout::RowMajorInterleaved, // LaneLayout
      LaneMmaShape
  >;

  using MmaWarpSimt = cutlass::conv::warp::MmaDepthwiseDirectConvSimt<
      WarpShape,              /// Size of the Gemm problem - concept: gemm::GemmShape<>
      FilterShape,            /// Shape of filter shape per threadblock - concept: gemm::GemmShape
      ThreadOutputShape,      /// Size of the output tile computed by thread - concept: conv::TensorNHWCShape<>
      ThreadBlockOutputShape, /// Size of the output tile computed by threadblock - concept: conv::TensorNHWCShape<>
      ElementA,               /// Data type of A elements
      SmemLayoutA,            /// Layout of A matrix (concept: MatrixLayout)
      ElementB,               /// Data type of B elements
      SmemLayoutB,            /// Layout of B matrix (concept: MatrixLayout)
      ElementC,               /// Element type of C matrix
      LayoutC,                /// Layout of C matrix (concept: MatrixLayout)
      Policy,                 /// Policy describing warp-level MmaSimtOp (concept: MmaSimtOp policy)
      IteratorAlgorithm::kFixedStrideDilation, /// Iterator algo type
      StrideShape,            /// Stride ( MatrixShape )
      DilationShape,          /// Dilation ( MatrixShape )
      ActivationShape         /// Activation Shape loaded by threadblock
  >;

  /// Policy used to define MmaPipelined
  using MmaPolicy = cutlass::conv::threadblock::DepthwiseDirectConvMmaPolicy<
      MmaWarpSimt,
      MatrixShape,               // skew for A matrix to avoid SMEM bank conflicts
      MatrixShape<0, kPaddingN>, // skew for B matrix to avoid SMEM bank conflicts
      IteratorThreadMapA,
      IteratorThreadMapB,
      WarpCount::kK
  >;
};
} // namespace threadblock
} // namespace conv
} // namespace cutlass
diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h
new file mode 100644
index 0000000000000000000000000000000000000000..482a52fe63209650546811aa24cafcc7419e7479
--- /dev/null
+++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h
@@ -0,0 +1,802 @@
/***************************************************************************************************
 * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1.
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a multistage threadblock-scoped fused activation's + scale+bias+relu and Implicit GEMM Convolution kernel. + + The original implicit gemm will store out-of-bound data as zeroes in the + shared memory because zeros into the tensor core, zeroes out of the tensor + cores. The result is remained the same. When fusing scale+bias+relu + into the mainloop, it is no longer true because + + 0 x scale + bias = bias + + which is no longer always 0. 
So, instead of storing zeroes, this fused + kernel stores the out-of-bound data as a special NaN (0x7eff), when applying + scale+bias+relu, the code is like + + if (data == 0x7eff) + data = 0; + else + data = scale+bias+relu(data, scale, bias); + + See include/cutlass/conv/warp/scale_bias_relu_transformation.h for the + elementwise computation. See include/cutlass/arch/memory_sm80.h for nan fill. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/cache_operation.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/gemm/warp/scale_bias_tile_iterator.h" +#include "cutlass/conv/warp/scale_bias_relu_transform.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. 
template <
    /// Size of the Gemm problem - concept: gemm::GemmShape<>
    typename Shape_,
    /// Element type of scale and bias vectors
    typename ElementScaleBias_,
    /// Layout of scale and bias vectors
    typename LayoutScaleBias_,
    /// Policy describing tuning details (concept: MmaPolicy)
    typename Policy_,
    /// WarpIterator to load Scale or Bias vector from the shared memory
    typename WarpIteratorScaleBias_,
    /// Number of stages
    int Stages,
    /// Used for partial specialization
    typename Enable = bool>
class MmaFpropFusionBase {
  // Base class for the multistage fused scale+bias+relu fprop mainloop: owns
  // the shared-memory buffers for the A/B tiles and the A scale/bias vectors,
  // and the warp-level iterators that read them back out of shared memory.
  //
  // NOTE(review): several <...> template-argument lists below appear to have
  // been stripped during extraction (e.g. "GemmShape;", "MatrixShape;",
  // "TensorRef;", "AlignedBuffer operand_A;"), so this vendored header will
  // not compile as-is; restore the argument lists from the upstream CUTLASS
  // implicit_gemm_fprop_fusion_multistage.h before use.
 public:
  ///< Size of the Gemm problem - concept: gemm::GemmShape<>
  using Shape = Shape_;

  ///< Element type of scale and bias vectors
  using ElementScaleBias = ElementScaleBias_;

  /// Layout of scale and bias vectors
  using LayoutScaleBias = LayoutScaleBias_;

  ///< Policy describing tuning details
  using Policy = Policy_;

  ///< WarpIterator to load Scale or Bias vector from the shared memory
  using WarpIteratorScaleBias = WarpIteratorScaleBias_;

  //
  // Dependent types
  //

  /// Warp-level Mma
  using Operator = typename Policy::Operator;

  /// Shape describing the overall GEMM computed from shared memory
  /// by each warp.
  using WarpGemm = typename Policy::Operator::Shape;

  /// Shape describing the number of warps filling the CTA
  using WarpCount = cutlass::gemm::GemmShape;

  /// Number of warp-level GEMM operations
  static int const kWarpGemmIterations =
      (WarpGemm::kK / Operator::Policy::MmaShape::kK);

  /// Number of stages
  static int const kStages = Stages;

  /// Tensor reference to the A operand
  using TensorRefA = TensorRef;

  /// Tensor reference to the scale and bias vectors
  using TensorRefScaleBias = TensorRef;

  /// Tensor reference to the B operand
  using TensorRefB = TensorRef;

  static_assert(kWarpGemmIterations > 1,
                "The pipelined structure requires at least two warp-level "
                "GEMM operations.");

  static_assert((kWarpGemmIterations % 2) == 0,
                "Inner loop iteration must be an even number.");

  //
  // Nested structs
  //

  /// Shared storage object needed by threadblock-scoped GEMM
  class SharedStorage {
   public:
    //
    // Type definitions
    //

    /// Shape of the A matrix operand in shared memory
    using ShapeA = MatrixShape;

    /// Shape of the A scale and bias vectors in shared memory
    // (factor 2: one scale row and one bias row per k-slice, staged kStages deep)
    using ShapeScaleBias =
        MatrixShape<1 + Policy::SmemPaddingA::kRow,
                    2 * Shape::kK * kStages + Policy::SmemPaddingA::kColumn>;

    /// Shape of the B matrix operand in shared memory
    using ShapeB =
        MatrixShape;

   public:
    //
    // Data members
    //

    /// Buffer for A operand
    AlignedBuffer operand_A;

    /// Buffer for B operand
    AlignedBuffer operand_B;

    /// Buffer for A operand Scale and Bias
    AlignedBuffer operand_A_scale_bias;

   public:

    //
    // Methods
    //

    /// Returns a layout object for the A matrix
    CUTLASS_DEVICE
    static typename Operator::LayoutA LayoutA() {
      return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn});
    }

    /// Returns a layout object for the B matrix
    CUTLASS_HOST_DEVICE
    static typename Operator::LayoutB LayoutB() {
      return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn});
    }

    /// Returns a layout object for the A scale and bias vectors
    CUTLASS_DEVICE
    static LayoutScaleBias LayoutScaleBias() {
      return LayoutScaleBias::packed(
          {ShapeScaleBias::kRow, ShapeScaleBias::kColumn});
    }

    /// Returns a TensorRef to the A operand
    CUTLASS_HOST_DEVICE
    TensorRefA operand_A_ref() {
      return TensorRefA{operand_A.data(), LayoutA()};
    }

    /// Returns a TensorRef to the B operand
    CUTLASS_HOST_DEVICE
    TensorRefB operand_B_ref() {
      return TensorRefB{operand_B.data(), LayoutB()};
    }

    /// Returns a TensorRef to the A operand Scale vector
    CUTLASS_HOST_DEVICE
    TensorRefScaleBias operand_A_scale_bias_ref() {
      return TensorRefScaleBias{operand_A_scale_bias.data(), LayoutScaleBias()};
    }
  };

 protected:

  //
  // Data members
  //

  /// Iterator to load a warp-scoped tile of A operand from shared memory
  typename Operator::IteratorA warp_tile_iterator_A_;

  /// Iterator to load a warp-scoped tile of A operand scale and bias vector
  /// from shared memory
  WarpIteratorScaleBias warp_tile_iterator_A_scale_bias_;

  /// Iterator to load a warp-scoped tile of B operand from shared memory
  typename Operator::IteratorB warp_tile_iterator_B_;

public:

  /// Construct from tensor references
  // All three warp iterators are positioned by lane_idx only here; the derived
  // class applies the per-warp tile offsets.
  CUTLASS_DEVICE
  MmaFpropFusionBase(
      ///< Shared storage needed for internal use by threadblock-scoped GEMM
      SharedStorage &shared_storage,
      ///< ID within the threadblock
      int thread_idx,
      ///< ID of warp
      int warp_idx,
      ///< ID of each thread within a warp
      int lane_idx)
      : warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx),
        warp_tile_iterator_A_scale_bias_(
            shared_storage.operand_A_scale_bias_ref(), lane_idx),
        warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx) {}
};

/////////////////////////////////////////////////////////////////////////////////////////////////

/// Structure to compute the matrix product targeting CUDA cores and SIMT math
/// instructions.
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Iterates over vectors of scale and bias vector in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorScaleBias_, + /// Iterates over vectors of scale and bias vector in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorScaleBias_, + /// Cache operation for scale/bias operand + cutlass::arch::CacheOperation::Kind CacheOpScaleBias, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// WarpIterator to load Scale or Bias vector from the shared memory + typename WarpIteratorScaleBias_, + /// Number of stages, + int Stages, + /// Used for partial specialization + typename Enable = bool> +class ImplicitGemmFpropFusionMultistage + : public MmaFpropFusionBase { + public: + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA_; + ///< Iterates over tiles of B operand in global memory + using IteratorB = IteratorB_; + ///< Iterates over 
tiles of the scale and bias vectors in global memory + using IteratorScaleBias = IteratorScaleBias_; + ///< WarpIterator to load Scale or Bias vector from the shared memory + using WarpIteratorScaleBias = WarpIteratorScaleBias_; + ///< Policy describing tuning details + using Policy = Policy_; + ///< Base class + using Base = MmaFpropFusionBase; + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + using SmemIteratorScaleBias = SmemIteratorScaleBias_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + static cutlass::arch::CacheOperation::Kind const kCacheOpScaleBias = + CacheOpScaleBias; + + // + // Dependent types + // + + /// Fragment of accumulator tile + + using ElementC = typename Policy::Operator::ElementC; + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Internal structure exposed for introspection. 
+ struct Detail { + + static_assert(Base::kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + /// Number of cp.async instructions to load one stage of operand A + static int const AsyncCopyIterationsPerStageA = + IteratorA::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of operand B + static int const AsyncCopyIterationsPerStageB = + IteratorB::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of cp.async instructions to load on group of operand A + static int const kAccessesPerGroupA = + (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB = + (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + }; + + private: + + using WarpLoadedFragmentA = typename Operator::FragmentA; + using WarpLoadedFragmentB = typename Operator::FragmentB; + using WarpLoadedFragmentScaleBias = + typename WarpIteratorScaleBias::Fragment; + + using WarpTransformedFragmentA = typename Operator::TransformedFragmentA; + using WarpTransformedFragmentB = typename Operator::TransformedFragmentB; + + private: + + // + // Data members + // + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of A operand scale vector to shared memory + SmemIteratorScaleBias smem_iterator_A_scale_bias_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + ImplicitGemmFpropFusionMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage &shared_storage, + ///< ID within the 
threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx), + smem_iterator_A_scale_bias_(shared_storage.operand_A_scale_bias_ref(), + thread_idx), + smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_A_scale_bias_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance(IteratorA &iterator_A, + IteratorScaleBias &iterator_A_scale_bias, + IteratorB &iterator_B, int group_start_A = 0, + int group_start_B = 0) { + iterator_A.set_iteration_index(group_start_A); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) { + + if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + int const kSrcBytes = sizeof_bits::value * + 
IteratorA::ThreadMap::kElementsPerAccess / 8; + + // Uses nan fill for out of bound data + cutlass::arch::cp_async_nan( + dst_ptr, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + + ++this->smem_iterator_A_; + } + } + + // Async Copy for operand A scale and bias vector. Scale and bias vectors + // are small. One iteration is enough. + if (group_start_A == 0) { + typename IteratorScaleBias::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A_scale_bias_.get()); + + int const kSrcBytes = + sizeof_bits::value * + IteratorScaleBias::kElementsPerAccess / 8; + + cutlass::arch::cp_async( + dst_ptr, iterator_A_scale_bias.get(), iterator_A_scale_bias.valid()); + } + + iterator_B.set_iteration_index(group_start_B); + + this->smem_iterator_B_.set_iteration_index(group_start_B); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) { + if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) { + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr, iterator_B.get(), iterator_B.valid()); + + ++iterator_B; + ++this->smem_iterator_B_; + } + } + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations, + ///< destination accumulator tile + FragmentC &accum, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + ///< iterator over scale and bias vectors in global memory + IteratorScaleBias iterator_A_scale_bias, + ///< initial value of accumulator + FragmentC const &src_accum, + ///< number of iterations per channel + int gemm_k_iterations_per_channel = 0, + ///< Imaginary strides used for planar-complex only - ignored here + int64_t 
imag_stride_A = 0, + int64_t imag_stride_B = 0) { + + // + // Prologue + // + + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < Base::kStages - 1; + ++stage, --gemm_k_iterations) { + + iterator_A.set_iteration_index(0); + this->smem_iterator_A_.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + int const kSrcBytes = + sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / 8; + + // Uses Nan fill for out of bound data + cutlass::arch::cp_async_nan( + dst_ptr, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + ++this->smem_iterator_A_; + } + + // Async Copy for operand A scale and bias vectors. Scale and bias + // vectors are small. One iteration is enough. + { + typename IteratorScaleBias::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A_scale_bias_.get()); + + int const kSrcBytes = + sizeof_bits::value * + IteratorScaleBias::kElementsPerAccess / 8; + + cutlass::arch::cp_async( + dst_ptr, iterator_A_scale_bias.get(), iterator_A_scale_bias.valid()); + } + + iterator_B.set_iteration_index(0); + this->smem_iterator_B_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B_.get()); + + int const kSrcBytes = + sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr, iterator_B.get(), iterator_B.valid()); + + ++iterator_B; + ++this->smem_iterator_B_; + } + + // Move to the next stage + iterator_A.advance(); + iterator_A_scale_bias.advance(); + iterator_B.advance(); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_A_scale_bias_.add_tile_offset({0, 
1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Inserts a fence to group cp.async instructions into stages. + cutlass::arch::cp_async_fence(); + } + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + // Waits until kStages-2 stages have committed. + cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpLoadedFragmentA warp_loaded_frag_A[2]; + WarpLoadedFragmentB warp_loaded_frag_B[2]; + WarpLoadedFragmentScaleBias warp_loaded_frag_A_scale_bias[2]; + WarpTransformedFragmentA warp_transformed_frag_A[2]; + WarpTransformedFragmentB warp_transformed_frag_B[2]; + + Operator warp_mma; + cutlass::conv::warp::FpropScaleBiasReluTransform + elementwise_transform; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_A_scale_bias_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]); + this->warp_tile_iterator_A_scale_bias_.load( + warp_loaded_frag_A_scale_bias[0]); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_A_scale_bias_; + ++this->warp_tile_iterator_B_; + + // Start issuing the first group of the next stage outside of the mainloop + copy_tiles_and_advance(iterator_A, iterator_A_scale_bias, iterator_B); + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0], + warp_loaded_frag_A[0], warp_loaded_frag_B[0]); + + elementwise_transform(warp_transformed_frag_A[0], + warp_loaded_frag_A_scale_bias[0]); + + // + // Mainloop + // + + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > (-Base::kStages + 1);) { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix 
multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; + ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_A_scale_bias_.set_kgroup_index( + (warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_A_scale_bias_.load( + warp_loaded_frag_A_scale_bias[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_A_scale_bias_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k > 0) { + warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + warp_loaded_frag_A[warp_mma_k % 2], + warp_loaded_frag_B[warp_mma_k % 2]); + + elementwise_transform(warp_transformed_frag_A[warp_mma_k % 2], + warp_loaded_frag_A_scale_bias[warp_mma_k % 2]); + } + + warp_mma( + accum, + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + accum + ); + + // Issue global->shared copies for the next stage + int group_start_iteration_A, group_start_iteration_B; + + if (warp_mma_k + 1 == Base::kWarpGemmIterations) { + group_start_iteration_A = 0; + group_start_iteration_B = 0; + } else { + group_start_iteration_A = + (warp_mma_k + 1) * Detail::kAccessesPerGroupA; + group_start_iteration_B = + (warp_mma_k + 1) * Detail::kAccessesPerGroupB; + } + + copy_tiles_and_advance(iterator_A, iterator_A_scale_bias, iterator_B, + group_start_iteration_A, + group_start_iteration_B); + + + if (warp_mma_k + 1 == Base::kWarpGemmIterations) { + 
warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2], + warp_transformed_frag_B[(warp_mma_k + 1) % 2], + warp_loaded_frag_A[(warp_mma_k + 1) % 2], + warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + + elementwise_transform( + warp_transformed_frag_A[(warp_mma_k + 1) % 2], + warp_loaded_frag_A_scale_bias[(warp_mma_k + 1) % 2]); + } + + if (warp_mma_k + 2 == Base::kWarpGemmIterations) { + // Inserts a fence to group cp.async instructions into stages. + cutlass::arch::cp_async_fence(); + + // Waits until kStages-2 stages of cp.async have committed + arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_A.advance(); + iterator_A_scale_bias.advance(); + iterator_B.advance(); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_A_scale_bias_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_A_scale_bias_.add_tile_offset( + {0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_A_.add_tile_offset( + {0, -Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations}); + this->warp_tile_iterator_A_scale_bias_.add_tile_offset( + {0, -Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations, + 0}); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + + --gemm_k_iterations; + } + } + + } + + // Insert fence and wait for all outstanding cp.async operations to commit. 
+ cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/implicit_gemm_multistage.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/implicit_gemm_multistage.h new file mode 100644 index 0000000000000000000000000000000000000000..6c9c4792e289824afd1a761f5b7b4cc5972f167a --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/implicit_gemm_multistage.h @@ -0,0 +1,539 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a multistage threadblock-scoped Implicit GEMM Convolution kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/cache_operation.h" +#include "cutlass/gemm/threadblock/mma_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. 
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Used for partial specialization + typename Enable = bool> +class ImplicitGemmMultistage : + public gemm::threadblock::MmaBase { +public: + ///< Base class + using Base = gemm::threadblock::MmaBase; + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA_; + ///< Iterates over tiles of B operand in global memory + using IteratorB = IteratorB_; + ///< Policy describing tuning details + using Policy = Policy_; + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + // + // Dependent types + // + + /// Fragment of accumulator tile + + using ElementC = typename Policy::Operator::ElementC; + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = 
typename Policy::Operator; + + /// Internal structure exposed for introspection. + struct Detail { + + /// Number of cp.async instructions to load one stage of operand A + static int const AsyncCopyIterationsPerStageA = + IteratorA::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of operand B + static int const AsyncCopyIterationsPerStageB = + IteratorB::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of cp.async instructions to load on group of operand A + static int const kAccessesPerGroupA = + (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB = + (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + // Optional staged-accumulation (e.g., tf32x3 kernels) for improved numerical + // accuracy, where each mainloop iteration first accumulates into a temporary + // set of freshly-cleared accumulators, which are subsequently added to the + // final accumulator set. 
+ static bool const kStagedAccumulation = arch::detail::UseStagedAccumulation::value; + }; + + private: + + using WarpLoadedFragmentA = typename Operator::FragmentA; + using WarpLoadedFragmentB = typename Operator::FragmentB; + using WarpTransformedFragmentA = typename Operator::TransformedFragmentA; + using WarpTransformedFragmentB = typename Operator::TransformedFragmentB; + + private: + + // + // Data members + // + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + ImplicitGemmMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage &shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx + ): + Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx), + smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) + { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + 
{Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance( + IteratorA &iterator_A, IteratorB &iterator_B, + int group_start_A = 0, int group_start_B = 0) { + + iterator_A.set_iteration_index(group_start_A * + IteratorA::kAccessesPerVector); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) { + + if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + } + + iterator_B.set_iteration_index(group_start_B * + IteratorB::kAccessesPerVector); + + this->smem_iterator_B_.set_iteration_index(group_start_B); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) { + if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) { + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / + IteratorB::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_B.get(), iterator_B.valid()); + + ++iterator_B; + } + ++this->smem_iterator_B_; + } + } + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations, + ///< destination accumulator tile + FragmentC &accum, + ///< 
iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + ///< initial value of accumulator + FragmentC const &src_accum, + ///< number of iterations per channel + int gemm_k_iterations_per_channel = 0, + ///< Imaginary strides used for planar-complex only - ignored here + int64_t imag_stride_A = 0, + int64_t imag_stride_B = 0) { + + // + // Prologue + // + + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < Base::kStages - 1; + ++stage, --gemm_k_iterations) { + + iterator_A.set_iteration_index(0); + this->smem_iterator_A_.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + int const kSrcBytes = + sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + + iterator_B.set_iteration_index(0); + this->smem_iterator_B_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + int const kSrcBytes = + sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / + IteratorB::kAccessesPerVector / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_B.get(), iterator_B.valid()); + + ++iterator_B; + } + + ++this->smem_iterator_B_; + } + + // Move to the next stage + iterator_A.advance(); + iterator_B.advance(); + 
+ this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Inserts a fence to group cp.async instructions into stages. + cutlass::arch::cp_async_fence(); + } + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + // Waits until kStages-2 stages have committed. + cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpLoadedFragmentA warp_loaded_frag_A[2]; + WarpLoadedFragmentB warp_loaded_frag_B[2]; + WarpTransformedFragmentA warp_transformed_frag_A[2]; + WarpTransformedFragmentB warp_transformed_frag_B[2]; + + Operator warp_mma; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + // Start issuing the first group of the next stage outside of the mainloop + copy_tiles_and_advance(iterator_A, iterator_B); + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0], + warp_loaded_frag_A[0], warp_loaded_frag_B[0]); + + // tf32x3 kernels use staging accumulation. warp_mma uses a temporary + // accumulator and this temporary accumulator is added to the final + // accumulator once in every mainloop iteration. 
+ plus plus_accum; + + FragmentC tmp_accum; + + if (Detail::kStagedAccumulation) { + tmp_accum.clear(); + } + + // + // Mainloop + // + + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > (-Base::kStages + 1);) { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; + ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. + + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k > 0) + warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + warp_loaded_frag_A[warp_mma_k % 2], + warp_loaded_frag_B[warp_mma_k % 2]); + + // Issue global->shared copies for the next stage + int group_start_iteration_A, group_start_iteration_B; + + if (warp_mma_k + 1 == Base::kWarpGemmIterations) { + group_start_iteration_A = 0; + group_start_iteration_B = 0; + } else { + group_start_iteration_A = + (warp_mma_k + 1) * Detail::kAccessesPerGroupA; + group_start_iteration_B = + (warp_mma_k + 1) * Detail::kAccessesPerGroupB; + } + + copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A, + group_start_iteration_B); + + if (Detail::kStagedAccumulation) { + warp_mma( + tmp_accum, + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + tmp_accum + ); + + if (warp_mma_k == 0) { + accum = plus_accum(accum, tmp_accum); + tmp_accum.clear(); + } 
+ } else { + warp_mma( + accum, + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + accum + ); + } + + if (warp_mma_k + 1 == Base::kWarpGemmIterations) + warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2], + warp_transformed_frag_B[(warp_mma_k + 1) % 2], + warp_loaded_frag_A[(warp_mma_k + 1) % 2], + warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + + if (warp_mma_k + 2 == Base::kWarpGemmIterations) { + // Inserts a fence to group cp.async instructions into stages. + cutlass::arch::cp_async_fence(); + + // Waits until kStages-2 stages of cp.async have committed + arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_A.advance(); + iterator_B.advance(); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_A_.add_tile_offset( + {0, -Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations, + 0}); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + + --gemm_k_iterations; + } + } + + } + + if (Detail::kStagedAccumulation) { + accum = plus_accum(accum, tmp_accum); + } + + // Insert fence and wait for all outstanding cp.async operations to commit. 
+ cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h new file mode 100644 index 0000000000000000000000000000000000000000..45e27949665f797ba28afcd5f1cf98007c56eac9 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h @@ -0,0 +1,320 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/aligned_buffer.h" +#include "cutlass/numeric_conversion.h" + +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/mma_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions. 
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Transformation applied to A operand + typename TransformA_ = NumericArrayConverter< + typename SmemIteratorA_::Element, + typename IteratorA_::Element, + IteratorA_::Fragment::kElements>, + /// + /// Transformation applied to A operand + typename TransformB_ = NumericArrayConverter< + typename SmemIteratorB_::Element, + typename IteratorB_::Element, + IteratorB_::Fragment::kElements>, + /// Used for partial specialization + typename Enable = bool +> +class ImplicitGemmPipelined : public gemm::threadblock::MmaBase { +public: + + ///< Base class + using Base = gemm::threadblock::MmaBase; + + using Shape = Shape_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using IteratorA = IteratorA_; ///< Iterates over tiles of A operand in global memory + using IteratorB = IteratorB_; ///< Iterates over tiles of B operand in global memory + using ElementC = ElementC_; ///< Data type of accumulator matrix + using LayoutC = LayoutC_; ///< Layout of accumulator matrix + using Policy = Policy_; ///< Policy describing tuning details + + using SmemIteratorA = SmemIteratorA_; 
+ using SmemIteratorB = SmemIteratorB_; + + using TransformA = TransformA_; + using TransformB = TransformB_; + + // + // Dependent types + // + + /// Fragment of operand A loaded from global memory + using FragmentA = typename IteratorA::Fragment; + + /// Fragment of operand B loaded from global memory + using FragmentB = typename IteratorB::Fragment; + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Obtain the arch tag from the warp-level operator + using ArchTag = typename Policy::Operator::ArchTag; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + // staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline) + static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2"); + +private: + + using WarpFragmentA = typename Operator::FragmentA; + using WarpFragmentB = typename Operator::FragmentB; + +protected: + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + ImplicitGemmPipelined( + typename Base::SharedStorage &shared_storage, ///< Shared storage needed for internal use by threadblock-scoped GEMM + int thread_idx, ///< ID within the threadblock + int warp_idx, ///< ID of warp + int lane_idx ///< ID of each thread within a warp + ): + Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx), + smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) { + + // Compute warp location within threadblock tile by mapping the warp_id to + // 
three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + int gemm_k_iterations, ///< number of iterations of the mainloop + FragmentC &accum, ///< destination accumulator tile + IteratorA iterator_A, ///< iterator over A operand in global memory + IteratorB iterator_B, ///< iterator over B operand in global memory + FragmentC const &src_accum, ///< source accumulator tile + int gemm_k_iterations_per_channel = 0, ///< number of iterations per channel + TransformA transform_A = TransformA(), ///< transformation applied to A fragment + TransformB transform_B = TransformB()) { ///< transformation applied to B fragment + + // + // Prologue + // + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + FragmentA tb_frag_A; + FragmentB tb_frag_B; + + tb_frag_A.clear(); + tb_frag_B.clear(); + + // The last kblock is loaded in the prolog + iterator_A.load(tb_frag_A); + iterator_B.load(tb_frag_B); + + ++iterator_A; + ++iterator_B; + + this->smem_iterator_A_.store(transform_A(tb_frag_A)); + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B_; + + __syncthreads(); + + // Pair of fragments used to 
overlap shared memory loads and math instructions + WarpFragmentA warp_frag_A[2]; + WarpFragmentB warp_frag_B[2]; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + Operator warp_mma; + + int smem_write_stage_idx = 1; + + // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing + // shared memory loads (which have the tightest latency requirement). + + // + // Mainloop + // + + // Note: The main loop does not support Base::kWarpGemmIterations == 2. + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > 0; --gemm_k_iterations) { + // + // Loop over GEMM K dimension + // + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group + // as the case may be. 
+ + if (warp_mma_k == Base::kWarpGemmIterations - 1) { + + // Write fragments to shared memory + this->smem_iterator_A_.store(transform_A(tb_frag_A)); + + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + + __syncthreads(); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B_; + + // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory + if (smem_write_stage_idx == 1) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + } + else { + this->warp_tile_iterator_A_.add_tile_offset( + {0, -Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * Base::kWarpGemmIterations, + 0}); + } + + smem_write_stage_idx ^= 1; + } + + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + + this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k == 0) { + + iterator_A.load(tb_frag_A); + iterator_B.load(tb_frag_B); + + ++iterator_A; + ++iterator_B; + } + + warp_mma(accum, warp_frag_A[warp_mma_k % 2], + warp_frag_B[warp_mma_k % 2], accum); + } + } + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h 
b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h new file mode 100644 index 0000000000000000000000000000000000000000..3be08c1ad90cf896b0b2191aa0c0a4a5a8c5b033 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h @@ -0,0 +1,729 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a multistage threadblock-scoped fused activation's scale+bias+relu and + Implicit GEMM Convolution kernel. + + The original implicit gemm will store out-of-bound data as zeroes in the + shared memory because zeros into the tensor core, zeroes out of the tensor + cores. The result is remained the same. When fusing scale+bias+relu + into the mainloop, it is no longer true because + + 0 x scale + bias = bias + + which is no longer always 0. So, instead of storing zeroes, this fused + kernel stores the out-of-bound data as a special NaN (0x7eff), when applying + scale+bias+relu, the code is like + + if (data == 0x7eff) + data = 0; + else + data = scale+bias+relu(data, scale, bias); + + The biggest difference compared with the fused Fprop and scale+bias+relu is + that scale and bias are loop invariant in Wgrad so that they only needs to + be loaded once before the mainloop. + + See include/cutlass/conv/warp/scale_bias_relu_transformation.h for the + elementwise computation. See include/cutlass/arch/memory_sm80.h for nan fill. 
+ + +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/cache_operation.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/gemm/warp/scale_bias_tile_iterator.h" +#include "cutlass/conv/warp/scale_bias_relu_transform.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Element type of scale and bias vectors + typename ElementScaleBias_, + /// Layout of scale and bias vectors + typename LayoutScaleBias_, + /// Element type of scale and bias vectors + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Used for partial specialization + typename Enable = bool> +class MmaWgradFusionBase { + public: + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + + ///< Element type of scale and bias vectors + using ElementScaleBias = ElementScaleBias_; + + /// Layout of scale and bias vectors + using LayoutScaleBias = LayoutScaleBias_; + + ///< Policy describing tuning details + using Policy = Policy_; + + // + // Dependent types + // + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Shape describing the overall GEMM computed from shared memory + /// by each warp. 
+ using WarpGemm = typename Policy::Operator::Shape; + + /// Shape describing the number of warps filling the CTA + using WarpCount = cutlass::gemm::GemmShape; + + /// Number of warp-level GEMM oeprations + static int const kWarpGemmIterations = + (WarpGemm::kK / Operator::Policy::MmaShape::kK); + + /// Number of stages + static int const kStages = Stages; + + /// Tensor reference to the A operand + using TensorRefA = TensorRef; + + /// Tensor reference to the B operand + using TensorRefB = TensorRef; + + static_assert(kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + static_assert((kWarpGemmIterations % 2) == 0, + "Inner loop iteration must be an even number."); + + // + // Nested structs + // + + /// Shared storage object needed by threadblock-scoped GEMM + class SharedStorage { + public: + // + // Type definitions + // + + /// Shape of the A matrix operand in shared memory + using ShapeA = MatrixShape; + + /// Shape of the B matrix operand in shared memory + using ShapeB = + MatrixShape; + + public: + // + // Data members + // + + /// Buffer for A operand + AlignedBuffer operand_A; + + /// Buffer for B operand + AlignedBuffer operand_B; + + public: + + // + // Methods + // + + /// Returns a layout object for the A matrix + CUTLASS_DEVICE + static typename Operator::LayoutA LayoutA() { + return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn}); + } + + /// Returns a layout object for the B matrix + CUTLASS_HOST_DEVICE + static typename Operator::LayoutB LayoutB() { + return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn}); + } + + /// Returns a TensorRef to the A operand + CUTLASS_HOST_DEVICE + TensorRefA operand_A_ref() { + return TensorRefA{operand_A.data(), LayoutA()}; + } + + /// Returns a TensorRef to the B operand + CUTLASS_HOST_DEVICE + TensorRefB operand_B_ref() { + return TensorRefB{operand_B.data(), LayoutB()}; + } + }; + + protected: + + // + // Data members + // + + /// 
Iterator to load a warp-scoped tile of A operand from shared memory + typename Operator::IteratorA warp_tile_iterator_A_; + + /// Iterator to load a warp-scoped tile of B operand from shared memory + typename Operator::IteratorB warp_tile_iterator_B_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + MmaWgradFusionBase( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + SharedStorage &shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : warp_tile_iterator_A_(shared_storage.operand_A_ref(), lane_idx), + warp_tile_iterator_B_(shared_storage.operand_B_ref(), lane_idx) {} +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Iterates over vectors of scale and bias vector in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename 
IteratorScaleBias_, + /// Iterates over vectors of scale and bias vector i + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Used for partial specialization + typename Enable = bool> +class ImplicitGemmWgradFusionMultistage + : public MmaWgradFusionBase { + public: + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA_; + ///< Iterates over tiles of B operand in global memory + using IteratorB = IteratorB_; + ///< Iterates over tiles of the scale and bias vectors in global memory + using IteratorScaleBias = IteratorScaleBias_; + ///< Policy describing tuning details + using Policy = Policy_; + ///< Base class + using Base = MmaWgradFusionBase; + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + // + // Dependent types + // + + /// Fragment of accumulator tile + + using ElementC = typename Policy::Operator::ElementC; + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Internal structure exposed for introspection. 
+ struct Detail { + + /// Number of cp.async instructions to load one stage of operand A + static int const AsyncCopyIterationsPerStageA = + IteratorA::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of operand B + static int const AsyncCopyIterationsPerStageB = + IteratorB::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of cp.async instructions to load on group of operand A + static int const kAccessesPerGroupA = + (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB = + (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + static int const kBBufferSize = + ((sizeof(typename Operator::ElementC) == 4) && + ((platform::is_same::value && + platform::is_same::value)) && + (Operator::Shape::kM >= 64 && Operator::Shape::kN >= 64)) + ? 
1 + : 2; + }; + + private: + + using WarpLoadedFragmentA = typename Operator::FragmentA; + using WarpLoadedFragmentB = typename Operator::FragmentB; + using WarpLoadedFragmentScaleBias = typename IteratorScaleBias::Fragment; + + using WarpTransformedFragmentA = typename Operator::TransformedFragmentA; + using WarpTransformedFragmentB = typename Operator::TransformedFragmentB; + + private: + + // + // Data members + // + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + + int warp_idx_m_; + + int warp_idx_n_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + ImplicitGemmWgradFusionMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage &shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx), + smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) { + + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + warp_idx_m_ = warp_idx_mn % Base::WarpCount::kM; + warp_idx_n_ = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m_, Base::kWarpGemmIterations * warp_idx_k}); + 
this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n_}); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance(IteratorA &iterator_A, + IteratorB &iterator_B, + int group_start_A = 0, int group_start_B = 0) { + + iterator_A.set_iteration_index(group_start_A); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) { + + if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + + ++this->smem_iterator_A_; + } + } + + iterator_B.set_iteration_index(group_start_B); + + this->smem_iterator_B_.set_iteration_index(group_start_B); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) { + if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) { + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / 8; + + // Uses nan fill for out of bound data + cutlass::arch::cp_async_nan( + dst_ptr, iterator_B.get(), iterator_B.valid()); + + ++iterator_B; + ++this->smem_iterator_B_; + } + } + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations, + ///< destination accumulator tile + FragmentC &accum, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + ///< iterator over scale and bias vectors in global memory + IteratorScaleBias 
iterator_B_scale_bias, + ///< initial value of accumulator + FragmentC const &src_accum, + ///< number of iterations per channel + int gemm_k_iterations_per_channel = 0, + ///< Imaginary strides used for planar-complex only - ignored here + int64_t imag_stride_A = 0, + int64_t imag_stride_B = 0) { + + // + // Prologue + // + + WarpLoadedFragmentScaleBias warp_loaded_frag_B_scale_bias; + iterator_B_scale_bias.add_tile_offset({0, warp_idx_n_}); + iterator_B_scale_bias.load(warp_loaded_frag_B_scale_bias); + + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < Base::kStages - 1; + ++stage, --gemm_k_iterations) { + + iterator_A.set_iteration_index(0); + this->smem_iterator_A_.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + int const kSrcBytes = + sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + ++this->smem_iterator_A_; + } + + iterator_B.set_iteration_index(0); + this->smem_iterator_B_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B_.get()); + + int const kSrcBytes = + sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / 8; + + // Uses Nan fill for out of bound data + cutlass::arch::cp_async_nan( + dst_ptr, iterator_B.get(), iterator_B.valid()); + + ++iterator_B; + ++this->smem_iterator_B_; + } + + // Move to the next stage + iterator_A.advance(); + iterator_B.advance(); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Inserts a fence to group cp.async 
instructions into stages. + cutlass::arch::cp_async_fence(); + } + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + // Waits until kStages-2 stages have committed. + cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpLoadedFragmentA warp_loaded_frag_A[Detail::kBBufferSize]; + WarpLoadedFragmentB warp_loaded_frag_B[2]; + WarpTransformedFragmentA warp_transformed_frag_A[Detail::kBBufferSize]; + WarpTransformedFragmentB warp_transformed_frag_B[2]; + + Operator warp_mma; + cutlass::conv::warp::WgradScaleBiasReluTransform + elementwise_transform; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + // Start issuing the first group of the next stage outside of the mainloop + copy_tiles_and_advance(iterator_A, iterator_B); + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0], + warp_loaded_frag_A[0], warp_loaded_frag_B[0]); + + elementwise_transform(warp_transformed_frag_B[0], + warp_loaded_frag_B_scale_bias); + + // + // Mainloop + // + + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > (-Base::kStages + 1);) { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; + ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. 
+ + if (Detail::kBBufferSize == 2) { + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % Detail::kBBufferSize]); + ++this->warp_tile_iterator_A_; + } + + this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_B_; + + if (warp_mma_k > 0) { + warp_mma.transform(warp_transformed_frag_A[warp_mma_k % Detail::kBBufferSize], + warp_transformed_frag_B[warp_mma_k % 2], + warp_loaded_frag_A[warp_mma_k % Detail::kBBufferSize], + warp_loaded_frag_B[warp_mma_k % 2]); + + elementwise_transform(warp_transformed_frag_B[warp_mma_k % 2], + warp_loaded_frag_B_scale_bias); + } + + warp_mma( + accum, + warp_transformed_frag_A[warp_mma_k % Detail::kBBufferSize], + warp_transformed_frag_B[warp_mma_k % 2], + accum + ); + + if (Detail::kBBufferSize == 1) { + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]); + ++this->warp_tile_iterator_A_; + + } + + if (warp_mma_k + 1 == Base::kWarpGemmIterations) { + warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % Detail::kBBufferSize], + warp_transformed_frag_B[(warp_mma_k + 1) % 2], + warp_loaded_frag_A[(warp_mma_k + 1) % Detail::kBBufferSize], + warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + + elementwise_transform( + warp_transformed_frag_B[(warp_mma_k + 1) % 2], + warp_loaded_frag_B_scale_bias); + } + + // Issue global->shared copies for the next stage + int group_start_iteration_A, group_start_iteration_B; + + if (warp_mma_k + 1 == Base::kWarpGemmIterations) { + group_start_iteration_A = 0; + group_start_iteration_B = 0; + } else { + group_start_iteration_A = + (warp_mma_k + 1) * Detail::kAccessesPerGroupA; + group_start_iteration_B = + (warp_mma_k + 1) * Detail::kAccessesPerGroupB; 
+ } + + copy_tiles_and_advance(iterator_A, iterator_B, + group_start_iteration_A, + group_start_iteration_B); + + if (warp_mma_k + 2 == Base::kWarpGemmIterations) { + // Inserts a fence to group cp.async instructions into stages. + cutlass::arch::cp_async_fence(); + + // Waits until kStages-2 stages of cp.async have committed + arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_A.advance(); + iterator_B.advance(); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_A_.add_tile_offset( + {0, -Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations, + 0}); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + + --gemm_k_iterations; + } + } + + } + + // Insert fence and wait for all outstanding cp.async operations to commit. 
+ cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h new file mode 100644 index 0000000000000000000000000000000000000000..dac642385cd445e9a36a2f4c6f6c9e51f309cb87 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h @@ -0,0 +1,470 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Templates calculating the address and predicates to the load of scale and bias vectors. + + This iterator uses masks to guard out-of-bounds accesses. + + A precomputed "Params" object minimizes the amount of state that must be + stored in registers, and integer addition is used to advance the pointer + through memory. 
+*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/cutlass.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" +#include "cutlass/conv/threadblock/conv2d_params.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// PredicatedScaleBiasVectorAccessIterator +/// +template +class PredicatedScaleBiasVectorAccessIterator; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIterator for fprop pitch-linear data. +/// +template +class PredicatedScaleBiasVectorAccessIterator { + public: + + using ThreadblockShape = ThreadblockShape_; + using Element = Element_; + using Layout = layout::PitchLinear; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using ConstPointer = const Element *; + using NonConstPointer = typename platform::remove_const::type *; + + static int const kElementsPerAccess = 128 / sizeof_bits::value; + static int const kThreads = ThreadblockShape::kContiguous / kElementsPerAccess; + + using AccessType = AlignedArray; + + using Params = PredicatedScaleBiasVectorAccessIteratorParams; + + private: + /// Internal pointer type permits fast address arithmetic + using BytePointer = char *; + + private: + // + // Data members + // + + /// Parameters object with precomputed internal state + Params const ¶ms_; + + /// Internal pointer to first access of tile + BytePointer pointer_; + + int problem_size_trs; + int problem_size_c; + int 
filter_trs_; + + TensorCoord thread_offset_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedScaleBiasVectorAccessIterator( + /// Precomputed parameters object + Params const ¶ms, + /// Extent of tensor + Conv2dProblemSize const &problem_size, + /// Pointer to the start of the scale vector + ConstPointer scale_pointer, + /// Pointer to the start of the bias vector + ConstPointer bias_pointer, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const &threadblock_offset) + : params_(params), + problem_size_trs(problem_size.R * problem_size.S), + problem_size_c(problem_size.C), + filter_trs_(0) { + pointer_ = (thread_id < kThreads) + ? reinterpret_cast( + const_cast(scale_pointer)) + : reinterpret_cast( + const_cast(bias_pointer)); + + // Per-thread offset in logical coordinates of tensor + int thread_base = (thread_id < kThreads) ? 0 : kThreads; + + thread_offset_ = + threadblock_offset + + TensorCoord((thread_id - thread_base) * kElementsPerAccess, 0); + + set_iteration_index(0); + } + + CUTLASS_HOST_DEVICE + PredicatedScaleBiasVectorAccessIterator( + /// Precomputed parameters object + Params const ¶ms, + /// Extent of tensor + Conv3dProblemSize const &problem_size, + /// Pointer to the start of the scale vector + ConstPointer scale_pointer, + /// Pointer to the start of the bias vector + ConstPointer bias_pointer, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const &threadblock_offset) + : params_(params), + problem_size_trs(problem_size.T * problem_size.R * problem_size.S), + problem_size_c(problem_size.C), + filter_trs_(0) { + pointer_ = (thread_id < kThreads) + ? 
reinterpret_cast( + const_cast(scale_pointer)) + : reinterpret_cast( + const_cast(bias_pointer)); + + // Per-thread offset in logical coordinates of tensor + int thread_base = (thread_id < kThreads) ? 0 : kThreads; + + thread_offset_ = + threadblock_offset + + TensorCoord((thread_id - thread_base) * kElementsPerAccess, 0); + + set_iteration_index(0); + } + + /// Construct a PredicatedTileAccessIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + PredicatedScaleBiasVectorAccessIterator( + /// Precomputed parameters object + Params const ¶ms, + /// Extent of tensor + Conv2dProblemSize const &problem_size, + /// Pointer to start of scale vector + ConstPointer scale_pointer, + /// Pointer to start of scale vector + ConstPointer bias_pointer, + ///< ID of each participating thread + int thread_id) + : PredicatedScaleBiasVectorAccessIterator(params, problem_size, + scale_pointer, bias_pointer, + thread_id, make_Coord(0, 0)) {} + + CUTLASS_HOST_DEVICE + PredicatedScaleBiasVectorAccessIterator( + /// Precomputed parameters object + Params const ¶ms, + /// Extent of tensor + Conv3dProblemSize const &problem_size, + /// Pointer to start of scale vector + ConstPointer scale_pointer, + /// Pointer to start of scale vector + ConstPointer bias_pointer, + ///< ID of each participating thread + int thread_id) + : PredicatedScaleBiasVectorAccessIterator(params, problem_size, + scale_pointer, bias_pointer, + thread_id, make_Coord(0, 0)) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) {} + + /// Advances an iterator along logical dimensions of matrix in units of whole threadblock tiles + CUTLASS_DEVICE + void add_tile_offset( + TensorCoord const &tile_offset) { + thread_offset_ = + thread_offset_ + + TensorCoord(ThreadblockShape::kContiguous * tile_offset.contiguous(), 0); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + + return reinterpret_cast( + pointer_ + + 
(thread_offset_.contiguous() * sizeof_bits::value / 8)); + } + + /// Increment and return an instance to self. + CUTLASS_HOST_DEVICE + PredicatedScaleBiasVectorAccessIterator &operator++() { + return *this; + } + + /// Increment and return an instance to self. + CUTLASS_HOST_DEVICE + void advance() { + // moves to the next tile + ++filter_trs_; + if (filter_trs_ == problem_size_trs) { + filter_trs_ = 0; + add_tile_offset(TensorCoord(1, 0)); + } + } + + /// Increment and return an instance to self. + CUTLASS_DEVICE + PredicatedScaleBiasVectorAccessIterator operator++(int) { + PredicatedScaleBiasVectorAccessIterator self(*this); + operator++(); + return self; + } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { + uint32_t enabled = 0; + +#if defined(_MSC_VER) || (__CUDACC_VER_MAJOR__ < 11) + enabled = threadIdx.x < kThreads * 2; +#else + asm volatile( + "{\n" + " .reg .u32 tid_reg;\n" + " .reg .pred p;\n" + " mov.u32 tid_reg, %%tid.x;\n" + " setp.lt.u32 p, tid_reg, %1;\n" + " selp.u32 %0, 1, 0, p;\n" + "}\n" : "+r"(enabled) :"n"(kThreads * 2)); +#endif + + return ((thread_offset_.contiguous() < problem_size_c) && enabled); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIterator for row-major data. 
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template +class PredicatedScaleBiasVectorAccessIterator { + public: + + using ThreadblockShape = ThreadblockShape_; + using Element = Element_; + using Layout = layout::RowMajor; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using ConstPointer = const Element *; + using NonConstPointer = typename platform::remove_const::type *; + + using UnderlyingIterator = PredicatedScaleBiasVectorAccessIterator< + layout::PitchLinearShape, + Element, + layout::PitchLinear>; + + using AccessType = typename UnderlyingIterator::AccessType; + static int const kElementsPerAccess = UnderlyingIterator::kElementsPerAccess; + + using Params = PredicatedScaleBiasVectorAccessIteratorParams; + + private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedScaleBiasVectorAccessIterator( + ///< Precomputed parameters object + Params const ¶ms, + ///< Extent of tensor + Conv2dProblemSize const &problem_size, + ///< Pointer to the start of the scale vector + ConstPointer scale_pointer, + ///< Pointer to the start of the bias vector + ConstPointer bias_pointer, + ///< ID of each participating thread + int thread_id, + ///< Initial offset of threadblock + TensorCoord const &threadblock_offset) + : iterator_(params, problem_size, scale_pointer, bias_pointer, + thread_id, + layout::PitchLinearCoord(threadblock_offset.column(), + threadblock_offset.row())) {} + + CUTLASS_HOST_DEVICE + PredicatedScaleBiasVectorAccessIterator( + ///< Precomputed parameters object + 
Params const ¶ms, + ///< Extent of tensor + Conv3dProblemSize const &problem_size, + ///< Pointer to the start of the scale vector + ConstPointer scale_pointer, + ///< Pointer to the start of the bias vector + ConstPointer bias_pointer, + ///< ID of each participating thread + int thread_id, + ///< Initial offset of threadblock + TensorCoord const &threadblock_offset) + : iterator_(params, problem_size, scale_pointer, bias_pointer, + thread_id, + layout::PitchLinearCoord(threadblock_offset.column(), + threadblock_offset.row())) {} + + /// Construct a PredicatedTileAccessIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + PredicatedScaleBiasVectorAccessIterator( + Params const ¶ms, ///< Precomputed parameters object + Conv2dProblemSize const &problem_size, ///< Extent of tensor + ConstPointer scale_pointer, ///< Pointer to the start of the scale vector + ConstPointer bias_pointer, ///< Pointer to the start of the bias vector + int thread_id ///< ID of each participating thread + ) + : PredicatedScaleBiasVectorAccessIterator(params, problem_size, + scale_pointer, bias_pointer, + thread_id, make_Coord(0, 0)) {} + + CUTLASS_HOST_DEVICE + PredicatedScaleBiasVectorAccessIterator( + Params const ¶ms, ///< Precomputed parameters object + Conv3dProblemSize const &problem_size, ///< Extent of tensor + ConstPointer scale_pointer, ///< Pointer to the start of the scale vector + ConstPointer bias_pointer, ///< Pointer to the start of the bias vector + int thread_id ///< ID of each participating thread + ) + : PredicatedScaleBiasVectorAccessIterator(params, problem_size, + scale_pointer, bias_pointer, + thread_id, make_Coord(0, 0)) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// threadblock tiles + CUTLASS_HOST_DEVICE + void add_tile_offset(TensorCoord const &tile_offset) { + 
iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()}); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedScaleBiasVectorAccessIterator &operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedScaleBiasVectorAccessIterator operator++(int) { + PredicatedScaleBiasVectorAccessIterator self(*this); + operator++(); + return self; + } + + /// Increment and return an instance to self. 
+ CUTLASS_HOST_DEVICE + void advance() { + iterator_.advance(); + } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { + return iterator_.valid(); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_iterator.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_iterator.h new file mode 100644 index 0000000000000000000000000000000000000000..e9844be9f000920fd82f18dc6dab5755611f08ea --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/predicated_scale_bias_vector_iterator.h @@ -0,0 +1,371 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Templates calculating the address and predicates to the load of scale and bias vectors. + + This iterator uses masks to guard out-of-bounds accesses. + + A precomputed "Params" object minimizes the amount of state that must be + stored in registers, and integer addition is used to advance the pointer + through memory. 
+*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/cutlass.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/predicate_vector.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/tensor_view.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// PredicatedScaleBiasVectorIterator +/// +template +class PredicatedScaleBiasVectorIterator; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIterator for wgrad pitch-linear data. +/// +template +class PredicatedScaleBiasVectorIterator { + public: + + using WarpShape = WarpShape_; + using Element = Element_; + using Layout = layout::PitchLinear; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using ConstPointer = const Element *; + using NonConstPointer = typename platform::remove_const::type *; + + static int const kElementsPerAccess = 1; + + using AccessType = AlignedArray; + + static int const kIterations = WarpShape::kContiguous / 8; + + /// Fragment object to be loaded or stored + using Fragment = cutlass::Array<__half2, 2 * kIterations * kElementsPerAccess>; + + /// Parameters object is precomputed state and is host-constructible + using Params = Conv2dWgradActivationIteratorOptimizedParams; + + private: + // + // Data members + // + + /// Parameters object with precomputed internal state + Params const ¶ms_; + + /// Internal pointer to first access of tile + ConstPointer scale_pointer_; + ConstPointer bias_pointer_; + + /// Size of tensor + Conv2dProblemSize 
problem_size_; + + int32_t thread_offset_; + + // Channel dimension in contiguous dimension stays constant for each gemm_iteration_k + int32_t filter_c_[kIterations]; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedScaleBiasVectorIterator( + /// Precomputed parameters object + Params const ¶ms, + /// Extent of tensor + Conv2dProblemSize const &problem_size, + /// Pointer to the start of the scale vector + ConstPointer scale_pointer, + /// Pointer to the start of the bias vector + ConstPointer bias_pointer, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const &threadblock_offset) + : params_(params), + problem_size_(problem_size), + scale_pointer_(scale_pointer), + bias_pointer_(bias_pointer) { + + thread_offset_ = threadblock_offset.contiguous() + (thread_id % 32) / 4; + } + + /// Construct a PredicatedTileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + PredicatedScaleBiasVectorIterator( + /// Precomputed parameters object + Params const ¶ms, + /// Extent of tensor + Conv2dProblemSize const &problem_size, + /// Pointer to start of scale vector + ConstPointer scale_pointer, + /// Pointer to start of scale vector + ConstPointer bias_pointer, + ///< ID of each participating thread + int thread_id) + : PredicatedScaleBiasVectorIterator(params, problem_size, + scale_pointer, bias_pointer, + thread_id, make_Coord(0, 0)) {} + + /// Advances an iterator along logical dimensions of matrix in units of whole warp tiles + CUTLASS_DEVICE + void add_tile_offset( + TensorCoord const &tile_offset) { + + thread_offset_ += (WarpShape::kContiguous * tile_offset.contiguous()); + + CUTLASS_PRAGMA_UNROLL + for(int c = 0; c < kIterations; ++c) { + int rsc_offset = thread_offset_ + c * 8; + + int residual, tmp; + params_.sc_divmod(tmp, residual, rsc_offset); + params_.c_divmod(tmp, filter_c_[c], residual); + } + } + + /// 
Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) { + + frag.fill(__float2half2_rn(0.0f)); + __half2 *frag_ptr = reinterpret_cast<__half2 *>(&frag); + + // load scale + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < kIterations; ++c) { + + cutlass::arch::global_load< + __half, + sizeof(AccessType) + >( + frag_ptr[c * 2].x, + scale_pointer_ + filter_c_[c], + true + ); + } + + // load bias + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < kIterations; ++c) { + + cutlass::arch::global_load< + __half, + sizeof(AccessType) + >( + frag_ptr[c * 2 + 1].x, + bias_pointer_ + filter_c_[c], + true + ); + } + + // duplicate scale + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < kIterations; ++c) { + frag_ptr[c * 2].y = frag_ptr[c * 2].x; + } + + // duplicate bias + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < kIterations; ++c) { + frag_ptr[c * 2 + 1].y = frag_ptr[c * 2 + 1].x; + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment &frag) { + load_with_pointer_offset(frag, 0); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIterator for row-major data. 
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template +class PredicatedScaleBiasVectorIterator { + public: + + using WarpShape = WarpShape_; + using Element = Element_; + using Layout = layout::RowMajor; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using ConstPointer = const Element *; + using NonConstPointer = typename platform::remove_const::type *; + + using UnderlyingIterator = PredicatedScaleBiasVectorIterator< + layout::PitchLinearShape, + Element, + layout::PitchLinear>; + + using AccessType = typename UnderlyingIterator::AccessType; + static int const kElementsPerAccess = UnderlyingIterator::kElementsPerAccess; + using Fragment = typename UnderlyingIterator::Fragment; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedScaleBiasVectorIterator; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + + /// Default ctor + CUTLASS_HOST_DEVICE + Params() { } + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Conv2dProblemSize const &problem_size, Layout const &layout) + : params_(problem_size, layout::TensorNHWC(0, 0, 0)){}; + }; + + private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedScaleBiasVectorIterator( + ///< Precomputed parameters object + Params const ¶ms, + ///< Extent of tensor + Conv2dProblemSize const &problem_size, + ///< Pointer to the start of the scale vector + ConstPointer scale_pointer, + ///< 
Pointer to the start of the bias vector + ConstPointer bias_pointer, + ///< ID of each participating thread + int thread_id, + ///< Initial offset of threadblock + TensorCoord const &threadblock_offset) + : iterator_(params.params_, problem_size, scale_pointer, bias_pointer, + thread_id, + layout::PitchLinearCoord(threadblock_offset.column(), + threadblock_offset.row())) {} + + /// Construct a PredicatedTileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + PredicatedScaleBiasVectorIterator( + Params const ¶ms, ///< Precomputed parameters object + Conv2dProblemSize const &problem_size, ///< Extent of tensor + ConstPointer scale_pointer, ///< Pointer to the start of the scale vector + ConstPointer bias_pointer, ///< Pointer to the start of the bias vector + int thread_id ///< ID of each participating thread + ) + : PredicatedScaleBiasVectorIterator(params, problem_size, + scale_pointer, bias_pointer, + thread_id, make_Coord(0, 0)) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// threadblock tiles + CUTLASS_HOST_DEVICE + void add_tile_offset(TensorCoord const &tile_offset) { + iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()}); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment &frag) { + iterator_.load(frag); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace conv +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git 
a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/threadblock_swizzle.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/threadblock_swizzle.h new file mode 100644 index 0000000000000000000000000000000000000000..0c5aed6dba0fa206fcab9545eeeb165558cb724a --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/threadblock/threadblock_swizzle.h @@ -0,0 +1,193 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Implements several possible threadblock-swizzling functions mapping blockIdx to + Convolution problems. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/platform/platform.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// +CUTLASS_HOST_DEVICE +static int get_strided_dgrad_tile_m( + cutlass::conv::Conv2dProblemSize const &problem_size, + int tile_size_m) { + + // CTAs in M dimension per starting filter position + int tile_m_per_filter = strided_dgrad_tile_m_per_filter(problem_size, tile_size_m); + + // Inflate number of CTAs in M dimension to cover every strating filter position even those that + // may fall out of valid MMA (Dy * w) but are needed to apply epilogue (beta * Dx_source) + // and point-wise fusion + int tile_m = tile_m_per_filter * int(problem_size.stride().product()); + + // There is a possible performance optimization here that leads up to 2x speeds 
than the current + // CUTLASS strided dgrad performance for stride > filter, i.e., stride={2x2} and filter={1x1}) + // + // * Optimization * + // Only launch CTAs in M dimension which contribute to a row in Dx output + // + // + // * Constraints * + // (A) stride <= filter, for example, stride={2x2} and filter={3x3}: + // - (A.1): There are no constraints for this case and the optimization does + // affect this case functionality or performance. + // (B) stride > filter, for example, stride={2x2} and filter={1x1}: + // - (B.1): Dx output tensor should be zero initialized + // - (B.2): The kernel epilogue cannot apply beta. Thus, beta should be zero + + return tile_m; +} +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Threadblock swizzling function for strided dgrad convolution +struct StridedDgradHorizontalThreadblockSwizzle : + public gemm::threadblock::GemmHorizontalThreadblockSwizzle { + + using Base = gemm::threadblock::GemmHorizontalThreadblockSwizzle; + + CUTLASS_HOST_DEVICE + StridedDgradHorizontalThreadblockSwizzle() { } + + /// Returns the shape of the problem in units of logical tiles + /// For ImplicitGemmConvolution Conv2d problem size: conv_operator(NPQK, NHWC, KRSC) + CUTLASS_HOST_DEVICE + static gemm::GemmCoord get_tiled_shape( + cutlass::conv::Operator conv_operator, + cutlass::conv::Conv2dProblemSize const &problem_size, + gemm::GemmCoord tile_size, + int split_k_slices) { + + gemm::GemmCoord implicit_gemm_problem_size = + cutlass::conv::implicit_gemm_problem_size(conv_operator, problem_size); + + // compute number of tiles in m dimension + int tile_m = get_strided_dgrad_tile_m(problem_size, tile_size.m()); + + // compute number of tiles in n dimension + int tile_n = (implicit_gemm_problem_size.n() + tile_size.n() - 1) / tile_size.n(); + + return gemm::GemmCoord( + tile_m, + tile_n, + 
split_k_slices); + } + + /// Returns the shape of the problem in units of logical tiles + /// For GEMM problem size (MxNxK) (Do not use base class get_tiled_shape()) + private: + using Base::get_tiled_shape; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Threadblock swizzling function for strided dgrad convolution +template +struct StridedDgradIdentityThreadblockSwizzle : + public gemm::threadblock::GemmIdentityThreadblockSwizzle { + + using Base = gemm::threadblock::GemmIdentityThreadblockSwizzle; + + CUTLASS_HOST_DEVICE + StridedDgradIdentityThreadblockSwizzle() { } + + /// Returns the shape of the problem in units of logical tiles + /// For ImplicitGemmConvolution Conv2d problem size: conv_operator(NPQK, NHWC, KRSC) + CUTLASS_HOST_DEVICE + static gemm::GemmCoord get_tiled_shape( + cutlass::conv::Operator conv_operator, + cutlass::conv::Conv2dProblemSize const &problem_size, + gemm::GemmCoord tile_size, + int split_k_slices) { + + gemm::GemmCoord implicit_gemm_problem_size = + cutlass::conv::implicit_gemm_problem_size(conv_operator, problem_size); + + // compute number of tiles in m dimension + int tile_m = get_strided_dgrad_tile_m(problem_size, tile_size.m()); + + // compute number of tiles in n dimension + int tile_n = (implicit_gemm_problem_size.n() + tile_size.n() - 1) / tile_size.n(); + + return gemm::GemmCoord( + tile_m, + tile_n, + split_k_slices); + } + + /// Returns the shape of the problem in units of logical tiles + /// For GEMM problem size (MxNxK) (Do not use base class get_tiled_shape()) + private: + using Base::get_tiled_shape; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Threadblock swizzling function for GEMMs +template +struct DepthwiseDirect2dConvIdentityThreadblockSwizzle + : public gemm::threadblock::GemmIdentityThreadblockSwizzle { + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvIdentityThreadblockSwizzle() {} + + 
/// Returns the shape of the problem in units of logical tiles + CUTLASS_HOST_DEVICE + static gemm::GemmCoord get_tiled_shape(cutlass::conv::Operator conv_operator, + cutlass::conv::Conv2dProblemSize const &problem_size, + gemm::GemmCoord tile_size, + int split_k_slices) { + + gemm::GemmCoord implicit_gemm_problem_size = + cutlass::conv::implicit_gemm_problem_size(conv_operator, problem_size); + + return gemm::GemmCoord(1, + (implicit_gemm_problem_size.n() + tile_size.n() - 1) / tile_size.n(), + split_k_slices); + } +}; + +} // namespace threadblock +} // namespace conv +} // namespace cutlass diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/warp/mma_depthwise_simt.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/warp/mma_depthwise_simt.h new file mode 100644 index 0000000000000000000000000000000000000000..b7af2e37bd610a12f334943902395b6956362589 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/warp/mma_depthwise_simt.h @@ -0,0 +1,380 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing warp-level matrix multiply-accumulate operations. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/warp/mma.h" + +#include "cutlass/gemm/thread/mma.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/thread/depthwise_mma.h" + + +#include "cutlass/gemm/warp/mma_simt_tile_iterator.h" +#include "cutlass/gemm/warp/mma_simt_policy.h" + +#include "cutlass/gemm/warp/mma_simt.h" +#include "cutlass/conv/warp/mma_depthwise_simt_tile_iterator.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace warp { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Data type of A elements + typename ElementA_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA_, + /// Data type of B elements + typename ElementB_, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB_, + /// Element type of C matrix + typename ElementC_, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC_, + /// Shape of the warp in units of thread (concept: MmaSimtPolicy) + typename Policy_, + /// Number of partitions along K dimension + int PartitionsK = 1, + /// Complex transformation on operand A + ComplexTransform TransformA = ComplexTransform::kNone, + /// Complex transformation on operand B + ComplexTransform TransformB = ComplexTransform::kNone, + /// Used for partial specialization + typename Enable = bool> +class MmaDepthwiseSimt + : public cutlass::gemm::warp:: + MmaSimt { + using Base = cutlass::gemm::warp:: + MmaSimt; + +public: + /// Shape of warp-level matrix operation (concept: GemmShape) + using Shape 
= Shape_; + + /// Data type of multiplicand A + using ElementA = ElementA_; + + /// Layout of multiplicand A + using LayoutA = LayoutA_; + + /// Data type of multiplicand B + using ElementB = ElementB_; + + /// Layout of multiplicand B + using LayoutB = LayoutB_; + + /// Data type of accumulator matrix C + using ElementC = ElementC_; + + /// Layout of accumulator matrix C + using LayoutC = LayoutC_; + + /// Shape of the warp in units of thread (concept: MmaLanePolicySimt) + using Policy = Policy_; + + /// Indicates class of matrix operator + using OperatorClass = arch::OpClassSimt; + + /// Hard-coded for now + using ArchTag = arch::Sm50; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = TransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = TransformB; + +public: + + /// Iterates over the B operand in memory + using IteratorB = cutlass::conv::warp::DepthwiseMmaSimtTileIterator< + MatrixShape, + cutlass::gemm::Operand::kB, + ElementB, + LayoutB, + Policy, + PartitionsK, + Shape::kK + >; + + /// Storage for B tile + using FragmentB = typename IteratorB::Fragment; + + /// Storage for transformed B tile + using TransformedFragmentB = FragmentB; + +public: + + // + // Methods + // + + /// Ctor + CUTLASS_DEVICE + MmaDepthwiseSimt():Base() {} +}; + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions. 
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Shape of filter shape per threadblock - concept: gemm::GemmShape + typename FilterShape_, + /// Shape of the output tile computed by thread- concept: conv::TensorNHWCShape<> + typename ThreadOutputShape_, + /// Shape of the output tile computed by threadblock - concept: conv::TensorNHWCShape<> + typename ThreadBlockOutputShape_, + /// Data type of A elements + typename ElementA_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA_, + /// Data type of B elements + typename ElementB_, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB_, + /// Element type of C matrix + typename ElementC_, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC_, + /// Shape of the warp in units of thread (concept: MmaSimtPolicy) + typename Policy_, + /// Iterator algo type + conv::IteratorAlgorithm IteratorAlgorithm_ = IteratorAlgorithm::kAnalytic, + /// Stride ( MatrixShape ) + typename StrideShape_ = cutlass::MatrixShape<-1, -1>, + /// Dilation ( MatrixShape ) + typename DilationShape_ = cutlass::MatrixShape<-1, -1>, + /// Activation Shape loaded by threadblock + typename ActivationShape_ = cutlass::conv::TensorNHWCShape<-1,-1,-1,-1>, + /// Number of partitions along K dimension + int PartitionsK = 1, + /// Complex transformation on operand A + ComplexTransform TransformA = ComplexTransform::kNone, + /// Complex transformation on operand B + ComplexTransform TransformB = ComplexTransform::kNone, + /// Used for partial specialization + typename Enable = bool> +class MmaDepthwiseDirectConvSimt { + public: + /// Shape of warp-level matrix operation (concept: GemmShape) + using Shape = Shape_; + + /// Shape of filter shape per threadblock - concept: gemm::GemmShape + using FilterShape = FilterShape_; + + /// Shape of the output tile computed by thread- concept: conv::TensorNHWCShape<> + using ThreadOutputShape = ThreadOutputShape_; + + /// Shape 
of the output tile computed by threadblock - concept: conv::TensorNHWCShape<> + using ThreadBlockOutputShape = ThreadBlockOutputShape_; + + /// Data type of multiplicand A + using ElementA = ElementA_; + + /// Layout of multiplicand A + using LayoutA = LayoutA_; + + /// Data type of multiplicand B + using ElementB = ElementB_; + + /// Layout of multiplicand B + using LayoutB = LayoutB_; + + /// Data type of accumulator matrix C + using ElementC = ElementC_; + + /// Layout of accumulator matrix C + using LayoutC = LayoutC_; + + /// Shape of the warp in units of thread (concept: MmaLanePolicySimt) + using Policy = Policy_; + + /// Iterator algo type + static conv::IteratorAlgorithm const IteratorAlgorithm = IteratorAlgorithm_; + + /// Stride ( MatrixShape ) + using StrideShape = StrideShape_; + + /// Dilation ( MatrixShape ) + using DilationShape = DilationShape_; + + /// Activation Shape loaded by threadblock + using ActivationShape = ActivationShape_; + + /// Indicates class of matrix operator + using OperatorClass = arch::OpClassSimt; + + /// Hard-coded for now + using ArchTag = arch::Sm50; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = TransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = TransformB; + + static constexpr bool use_dp4a = (platform::is_same< layout::ColumnMajorInterleaved<4>, LayoutA>::value || + platform::is_same< layout::RowMajorInterleaved<4>, LayoutA >::value) && + platform::is_same< ElementA, int8_t >::value && + platform::is_same< ElementB, int8_t >::value; + + using dp4a_type = typename platform::conditional< use_dp4a , int8_t, bool >::type; + + /// Thread-level matrix multiply accumulate operator + using ThreadMma = cutlass::conv::thread::DepthwiseDirectConvElementwiseInnerProduct< + cutlass::gemm::GemmShape< + Shape::kM / Policy::WarpShape::kRow, // number of output pixels proccessed per thread + Shape::kN / Policy::WarpShape::kColumn, // number of channels 
processed per thread + 1>, + ElementA, + ElementB, + ElementC, + arch::OpMultiplyAdd, + dp4a_type + >; + + /// Underlying matrix multiply operator (concept: arch::Mma) + using ArchMmaOperator = typename ThreadMma::ArchMmaOperator; + + /// Indicates math operator + using MathOperator = typename ArchMmaOperator::Operator; + + /// Shape of the underlying instruction + using InstructionShape = cutlass::gemm::GemmShape<1,1,use_dp4a ? 4 : 1>; + +public: + + /// Iterates over the A operand in memory + using IteratorA = cutlass::conv::warp::DepthwiseDirect2dConvSimtTileIterator< + MatrixShape, // per warp + FilterShape, + ThreadOutputShape, + ThreadBlockOutputShape, + cutlass::gemm::Operand::kA, + ElementA, + Policy, + IteratorAlgorithm, + StrideShape, + DilationShape, + ActivationShape, + PartitionsK, + Shape::kK + >; + + /// Storage for A tile + using FragmentA = typename IteratorA::Fragment; + + /// Storage for transformed A tile + using TransformedFragmentA = FragmentA; + + /// Iterates over the B operand in memory + using IteratorB = cutlass::gemm::warp::MmaSimtTileIterator< + MatrixShape<1, Shape::kN>, + cutlass::gemm::Operand::kB, + ElementB, + LayoutB, + Policy, + PartitionsK, + Shape::kK + >; + + /// Storage for B tile + using FragmentB = typename IteratorB::Fragment; + + /// Storage for transformed B tile + using TransformedFragmentB = FragmentB; + + /// Iterates over the C operand in memory + using IteratorC = cutlass::gemm::warp::MmaSimtTileIterator< + MatrixShape, + cutlass::gemm::Operand::kC, + ElementC, + LayoutC, + Policy + >; + + /// Storage for C tile + using FragmentC = typename ThreadMma::FragmentC; + +public: + + // + // Methods + // + + /// Ctor + CUTLASS_DEVICE + MmaDepthwiseDirectConvSimt() {} + + /// Performs a warp-level matrix multiply-accumulate operation + CUTLASS_DEVICE + void operator()( + FragmentC &d, + FragmentA a, + FragmentB b, + FragmentC const &c, int group_idx = 0) const { + + ThreadMma mma; + + mma(d, a, b, c); + } + + /// Transform 
the mma operands to the required types + CUTLASS_DEVICE + void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B, + FragmentA const &A, FragmentB const &B) const { + dst_A = A; + dst_B = B; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace conv +} // namespace cutlass diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/warp/mma_depthwise_simt_tile_iterator.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/warp/mma_depthwise_simt_tile_iterator.h new file mode 100644 index 0000000000000000000000000000000000000000..47fd1e08b9ff9f693b462fd89f5230475d918120 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/warp/mma_depthwise_simt_tile_iterator.h @@ -0,0 +1,862 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Describes the lane policy used by warp-level matrix multiply operators targeting SIMT + instructions +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/conv/convolution.h" + +#include "cutlass/arch/memory_sm75.h" + +#include "cutlass/layout/matrix.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/warp/mma_simt_policy.h" +#include "cutlass/gemm/warp/mma_simt_tile_iterator.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace warp { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Iterates over operands to warp-level matrix multiply operations targeting SIMT instructions +/// +/// concept: MutableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Operand identity + cutlass::gemm::Operand 
Operand, + /// Data type of A elements + typename Element_, + /// Layout of operand + typename Layout_, + /// Shape of the warp in units of thread (concept: MmaSimtPolicy) + typename Policy_, + /// Number of partitions along K dimension - used in sliced-K + int PartitionsK = 1, + /// Group Size along kPartition - used in sliced-K + int PartitionGroupSize = 1 +> +class DepthwiseMmaSimtTileIterator; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Specialization for B operands of row-major layouts +/// +/// Concept: MutableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Data type of A elements + typename Element_, + /// Shape of the warp in units of thread (concept: MmaSimtPolicy) + typename Policy_, + /// Number of partitions along K dimension + int PartitionsK, + /// Group Size along kPartition - used in sliced-K + int PartitionGroupSize> +class DepthwiseMmaSimtTileIterator + : public cutlass::gemm::warp::MmaSimtTileIterator { + + using Base = cutlass::gemm::warp::MmaSimtTileIterator; + public: + /// Shape of tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// Operand tag + static cutlass::gemm::Operand const kOperand = cutlass::gemm::Operand::kB; + + /// Element type + using Element = Element_; + + /// Layout of policy + using Layout = layout::RowMajor; + + /// Decomposition of elements among threads + using Policy = Policy_; + + /// TensorRef type for loading element from a tensor + using TensorRef = typename Base::TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Thread-level shape of a fragment + using ThreadShape = typename Base::ThreadShape; + + /// Number of individual loads + using 
Iterations = typename Base::Iterations; + + /// Fragment object holding a thread's part of a tile + using Fragment = typename Base::Fragment; + + static_assert(Policy::LaneMmaShape::kN == 1, "Each thread should be 1 element per LDS along the k-dim"); + +private: + + MatrixCoord lane_offset_; + int channel_idx_; + int base_channel_idx_; + int warps_n_; + + public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + DepthwiseMmaSimtTileIterator():Base() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + DepthwiseMmaSimtTileIterator( + TensorRef ref, + int lane_id + ) : Base(ref, lane_id) { + + // compute offset based on thread ID and lane layout + typename Policy::LaneLayout lane_layout = Policy::get_lane_layout(); + + warps_n_ = -1; + channel_idx_ = 0; + base_channel_idx_ = 0; + lane_offset_ = lane_layout.inverse(lane_id) * MatrixCoord(0, Policy::LaneMmaShape::kN); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + DepthwiseMmaSimtTileIterator &add_tile_offset(TensorCoord const &coord) { + + if(warps_n_ == -1){ + warps_n_ = coord.column(); + } + + Base::add_tile_offset(coord); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. (vector loads) + CUTLASS_HOST_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const { + Array *dst_ptr = + reinterpret_cast *>(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < Iterations::kRow; ++k) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Iterations::kColumn; ++n) { + + void const *ptr = this->ref_.data() + + this->ref_.offset({-(channel_idx_ - base_channel_idx_), + n * Policy::WarpShape::kColumn}) + + pointer_offset / Policy::LaneMmaShape::kN; + + // Base_k of a warp + Base_k of current threads. 
+ int thread_k_base_idx = + warps_n_ * Shape::kColumn / Policy::LaneMmaShape::kN + lane_offset_.column(); + + if (channel_idx_ + k == thread_k_base_idx + n * Policy::WarpShape::kColumn) { + // Depthwise kernel would only do computation when channel == k. + // Loads an element when the current computation channel == the k corresponding to this thread. + arch::shared_load(dst_ptr[n + k * Iterations::kColumn], ptr); + } else { + // Reduce SMEM load + dst_ptr[n + k * Iterations::kColumn].fill(Element(0)); + } + } + } + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + load_with_pointer_offset(frag, 0); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index + CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + if(k_group % PartitionGroupSize == 0 && k_group != 0){ + base_channel_idx_ = k_group; + } + channel_idx_ = k_group; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Size of filter (concept: gemm::GemmShape) + typename FilterShape_, + /// Size of the matrix to load (concept: MatrixShape) + typename ThreadOutputShape_, + /// Size of the matrix to load (concept: MatrixShape) + typename ThreadBlockOutputShape_, + /// Operand identity + cutlass::gemm::Operand Operand, + /// Data type of A elements + typename Element_, + /// Shape of the warp in units of thread (concept: MmaSimtPolicy) + typename Policy_, + /// Iterator algo type + conv::IteratorAlgorithm IteratorAlgorithm = IteratorAlgorithm::kAnalytic, + /// Stride ( MatrixShape ) + typename StrideShape = cutlass::MatrixShape<-1, -1>, + /// Dilation ( MatrixShape ) + typename DilationShape = cutlass::MatrixShape<-1, 
-1>, + /// Activation Shape loaded by threadblock + typename ActivationShape = cutlass::conv::TensorNHWCShape<-1,-1,-1,-1>, + /// Number of partitions along K dimension - used in sliced-K + int PartitionsK = 1, + /// Group Size along kPartition - used in sliced-K + int PartitionGroupSize = 1> +class DepthwiseDirect2dConvSimtTileIterator; + + +/// Specialization for A operands of row-major layouts +/// +/// Concept: MutableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Size of filter (concept: gemm::GemmShape) + typename FilterShape_, + /// Size of the matrix to load (concept: TensorNHWC) + typename ThreadOutputShape_, + /// Size of the matrix to load (concept: TensorNHWC) + typename ThreadBlockOutputShape_, + /// Data type of A elements + typename Element_, + /// Shape of the warp in units of thread (concept: MmaSimtPolicy) + typename Policy_, + /// Iterator algo type + conv::IteratorAlgorithm IteratorAlgorithm, + /// Stride ( MatrixShape ) + typename StrideShape, + /// Dilation ( MatrixShape ) + typename DilationShape, + /// Activation Shape loaded by threadblock + typename ActivationShape, + /// Number of partitions along K dimension - used in sliced-K + int PartitionsK, + /// Group Size along kPartition - used in sliced-K + int PartitionGroupSize> +class DepthwiseDirect2dConvSimtTileIterator { + public: + /// Shape of tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// Shape of filter (concept: gemm::GemmShape) + using FilterShape = FilterShape_; + + /// Shape of tile to load (concept: TensorNHWC) + using ThreadOutputShape = ThreadOutputShape_; + + /// Shape of tile to load (concept: TensorNHWC) + using ThreadBlockOutputShape = ThreadBlockOutputShape_; + + /// Operand tag + static cutlass::gemm::Operand const kOperand = cutlass::gemm::Operand::kA; + + /// Element type + using Element = Element_; + + /// Layout of policy + using Layout = layout::RowMajor; 
+ + /// Decomposition of elements among threads + using Policy = Policy_; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + // + // Derived quantities + // + + static_assert(!(Shape::kRow % Policy::WarpShape::kRow), + "The warp-level GEMM M size must be divisible by the number of threads arranged along the M dimension."); + + static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero."); + static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero."); + static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero."); + static_assert(Shape::kRow / Policy::WarpShape::kRow > 0, "Shape::kRow / Policy::WarpShape::kRow must be greater than zero."); + +// Thread-level shape of a fragment + using ThreadShape = MatrixShape< + ThreadOutputShape::kNHW, // Output tile shape Computed by current threads + ThreadOutputShape::kC + >; + + static_assert(!(ThreadShape::kColumn % Policy::LaneMmaShape::kN), + "Thread-level GEMM must be divisible by Policy::LaneMmaShape."); + + /// Number of individual loads + using Iterations = MatrixShape< + ThreadShape::kRow, + ThreadShape::kColumn / Policy::LaneMmaShape::kN + >; + + using ThreadTileCount = MatrixShape< + ThreadBlockOutputShape::kH / ThreadOutputShape::kH, + ThreadBlockOutputShape::kW / ThreadOutputShape::kW + >; + + /// Fragment object holding a thread's part of a tile + using Fragment = Array; + +protected: + + /// Internal reference + cutlass::TensorRef, layout::RowMajor> ref_; + + int activation_offset[ThreadOutputShape::kH][ThreadOutputShape::kW][Iterations::kColumn]; + int iterator_r_; + int iterator_s_; + int iterator_offset_; + + int inc_next_s_ ; + int inc_next_r_ ; + + MatrixCoord 
lane_offset_; +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvSimtTileIterator() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvSimtTileIterator( + TensorRef ref, + int lane_id + ) { + + // compute offset based on thread ID and lane layout + typename Policy::LaneLayout lane_layout = Policy::get_lane_layout(); + + // Set channel offset + lane_offset_ = lane_layout.inverse(lane_id) * MatrixCoord(0, Policy::LaneMmaShape::kN); + + ref.add_coord_offset(lane_offset_); + + ref_.reset(reinterpret_cast *>(ref.data()), + ref.stride(0) / Policy::LaneMmaShape::kN); + + iterator_r_ = 0; + iterator_s_ = 0; + iterator_offset_ = 0; + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvSimtTileIterator &add_pointer_offset(LongIndex offset) { + ref_.add_pointer_offset(offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. 
+ template + CUTLASS_HOST_DEVICE + void setup_initial_status(Params const& params) { + + inc_next_s_ = params.inc_next[0]; + inc_next_r_ = params.inc_next[1]; + + // Get base HW offset of current threads + int threadgroup = threadIdx.x / (ThreadBlockOutputShape::kC / ThreadOutputShape::kC); + int base_p_ = + (threadgroup / (ThreadTileCount::kColumn)) * ThreadOutputShape::kH; + int base_q_ = + (threadgroup % (ThreadTileCount::kColumn)) * ThreadOutputShape::kW; + + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < ThreadOutputShape::kH; ++p) { + CUTLASS_PRAGMA_UNROLL + for (int q = 0; q < ThreadOutputShape::kW; ++q) { + CUTLASS_PRAGMA_UNROLL + for (int col = 0; col < Iterations::kColumn; ++col) { + int base_w = (base_q_ + q) * params.stride[0]; + int base_h = (base_p_ + p) * params.stride[1]; + + int offset = base_h * params.activation_tile_w + base_w; + activation_offset[p][q][col] = offset; + } + } + } + } + + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvSimtTileIterator &add_tile_offset(TensorCoord const &coord) { + // Set warp row and col start + lane_offset_ = MatrixCoord({lane_offset_.row() + coord.row() * Shape::kRow, lane_offset_.column()}); + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + void advance(int32_t pointer_offset) { + ref_.reset(ref_.data() + pointer_offset / sizeof(Element) / Policy::LaneMmaShape::kN); + iterator_s_ = 0; + iterator_r_ = 0; + iterator_offset_ = 0; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvSimtTileIterator &operator++() { + ++iterator_s_; + if (iterator_s_ < FilterShape::kColumn) { + iterator_offset_ += inc_next_s_; + + return *this; + } + + iterator_s_ = 0; + + ++iterator_r_; + if (iterator_r_ < FilterShape::kRow) { + iterator_offset_ += inc_next_r_; + return *this; + } + + iterator_r_ = 0; + 
iterator_offset_ = 0; + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvSimtTileIterator & operator--() { + // Do nothing + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. (vector loads) + CUTLASS_HOST_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const { + + Array *dst_ptr = + reinterpret_cast *>(&frag); + + + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < ThreadOutputShape::kH; ++p) { + CUTLASS_PRAGMA_UNROLL + for (int q = 0; q < ThreadOutputShape::kW; ++q) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Iterations::kColumn; ++n) { + void const *ptr = ref_.data() + + ref_.offset({activation_offset[p][q][n] + (iterator_offset_), + n * Policy::WarpShape::kColumn}) + + pointer_offset / Policy::LaneMmaShape::kN; + arch::shared_load(dst_ptr[n + q + p * ThreadOutputShape::kW], ptr); + } + } + } + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + load_with_pointer_offset(frag, 0); + } + + /// Stores a fragment to memory at the location pointed to by the iterator + CUTLASS_HOST_DEVICE + void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const { + // Do nothing at present. 
+ } + + /// Stores a fragment to memory at the location pointed to by the iterator + CUTLASS_HOST_DEVICE + void store(Fragment const &frag, Index pointer_offset) const { + store_with_pointer_offset(frag, 0); + } + + CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + // no operation here + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/// Specialization for A operands of row-major layouts +/// +/// Concept: MutableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Size of filter (concept: gemm::GemmShape) + typename FilterShape_, + /// Size of the matrix to load (concept: TensorNHWC) + typename ThreadOutputShape_, + /// Size of the matrix to load (concept: TensorNHWC) + typename ThreadBlockOutputShape_, + /// Data type of A elements + typename Element_, + /// Shape of the warp in units of thread (concept: MmaSimtPolicy) + typename Policy_, + /// Stride ( MatrixShape ) + typename StrideShape_, + /// Dilation ( MatrixShape ) + typename DilationShape_, + /// Activation Shape loaded by threadblock + typename ActivationShape_, + /// Number of partitions along K dimension - used in sliced-K + int PartitionsK, + /// Group Size along kPartition - used in sliced-K + int PartitionGroupSize> +class DepthwiseDirect2dConvSimtTileIterator { + public: + /// Shape of tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// Shape of filter (concept: gemm::GemmShape) + using FilterShape = FilterShape_; + + /// Shape of tile to load (concept: TensorNHWC) + using ThreadOutputShape = ThreadOutputShape_; + + /// Shape of tile to load (concept: TensorNHWC) + using ThreadBlockOutputShape = ThreadBlockOutputShape_; + + /// Stride ( MatrixShape ) + using StrideShape = StrideShape_; + + /// Dilation ( MatrixShape ) + using DilationShape = DilationShape_; + + /// Activation Shape loaded by threadblock + using ActivationShape = 
ActivationShape_; + + /// Operand tag + static cutlass::gemm::Operand const kOperand = cutlass::gemm::Operand::kA; + + /// Element type + using Element = Element_; + + /// Layout of policy + using Layout = layout::RowMajor; + + /// Decomposition of elements among threads + using Policy = Policy_; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + // + // Derived quantities + // + + static_assert(!(Shape::kRow % Policy::WarpShape::kRow), + "The warp-level GEMM M size must be divisible by the number of threads arranged " + "along the M dimension."); + + static_assert(Shape::kRow > 0, "Shape::kRow must be greater than zero."); + static_assert(Shape::kColumn > 0, "Shape::kColumn must be greater than zero."); + static_assert(Policy::WarpShape::kRow > 0, "Policy::WarpShape::kRow must be greater than zero."); + static_assert(Shape::kRow / Policy::WarpShape::kRow > 0, + "Shape::kRow / Policy::WarpShape::kRow must be greater than zero."); + + // Activations loaded by threadblock + static int const ThreadActivationShapeH = (ThreadOutputShape::kH - 1) * StrideShape::kRow + + (FilterShape::kRow - 1) * DilationShape::kRow + 1; + + static int const ThreadActivationShapeW = (ThreadOutputShape::kW - 1) * StrideShape::kColumn + + (FilterShape::kColumn - 1) * DilationShape::kColumn + 1; + + using ThreadActivationShape = cutlass::conv:: + TensorNHWCShape<1, ThreadActivationShapeH, ThreadActivationShapeW, ThreadOutputShape::kC>; + + // Thread-level shape of a fragment + using ThreadShape = + MatrixShape; + + static_assert(!(ThreadShape::kColumn % Policy::LaneMmaShape::kN), + "Thread-level GEMM must be divisible by Policy::LaneMmaShape."); + + /// Number of individual loads + using Iterations = + MatrixShape; 
+ + using ThreadTileCount = MatrixShape; + + /// Fragment object holding a thread's part of a tile + using Fragment = Array; + + protected: + /// Internal reference + cutlass::TensorRef, layout::RowMajor> ref_; + + Array + activation[ThreadActivationShape::kH][ThreadActivationShape::kW][Iterations::kColumn]; + int iterator_r_; + int iterator_s_; + + + MatrixCoord lane_offset_; + + public: + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvSimtTileIterator() {} + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvSimtTileIterator(TensorRef ref, int lane_id) { + // compute offset based on thread ID and lane layout + typename Policy::LaneLayout lane_layout = Policy::get_lane_layout(); + + // Set channel offset + lane_offset_ = lane_layout.inverse(lane_id) * MatrixCoord(0, Policy::LaneMmaShape::kN); + + ref.add_coord_offset(lane_offset_); + + ref_.reset(reinterpret_cast *>(ref.data()), + ref.stride(0) / Policy::LaneMmaShape::kN); + + iterator_r_ = 0; + iterator_s_ = 0; + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvSimtTileIterator &add_pointer_offset(LongIndex offset) { + ref_.add_pointer_offset(offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. 
+ template + CUTLASS_HOST_DEVICE void setup_initial_status( + Params const ¶ms) { + + // Get base HW offset of current threads + int threadgroup = threadIdx.x / (ThreadBlockOutputShape::kC / ThreadOutputShape::kC); + int base_h = + (threadgroup / (ThreadTileCount::kColumn)) * ThreadOutputShape::kH * StrideShape::kRow; + int base_w = + (threadgroup % (ThreadTileCount::kColumn)) * ThreadOutputShape::kW * StrideShape::kColumn; + + CUTLASS_PRAGMA_UNROLL + for (int h = 0; h < ThreadActivationShape::kH; ++h) { + CUTLASS_PRAGMA_UNROLL + for (int w = 0; w < ThreadActivationShape::kW; ++w) { + CUTLASS_PRAGMA_UNROLL + for (int col = 0; col < Iterations::kColumn; ++col) { + int offset = (base_h + h) * ActivationShape::kW + (base_w + w); + + void const *ptr = ref_.data() + ref_.offset({offset, col * Policy::WarpShape::kColumn}); + arch::shared_load(activation[h][w][col], ptr); + } + } + } + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvSimtTileIterator &add_tile_offset(TensorCoord const &coord) { + // Set warp row and col start + lane_offset_ = + MatrixCoord({lane_offset_.row() + coord.row() * Shape::kRow, lane_offset_.column()}); + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + void advance(int32_t pointer_offset) { + ref_.reset(ref_.data() + pointer_offset / sizeof(Element) / Policy::LaneMmaShape::kN); + iterator_s_ = 0; + iterator_r_ = 0; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + DepthwiseDirect2dConvSimtTileIterator &operator++() { + ++iterator_s_; + if (iterator_s_ < FilterShape::kColumn) { + return *this; + } + + iterator_s_ = 0; + + ++iterator_r_; + if (iterator_r_ < FilterShape::kRow) { + return *this; + } + + iterator_r_ = 0; + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + 
DepthwiseDirect2dConvSimtTileIterator &operator--() { + // Do nothing + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. (vector loads) + CUTLASS_HOST_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const { + Array *dst_ptr = + reinterpret_cast *>(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < ThreadOutputShape::kH; ++p) { + CUTLASS_PRAGMA_UNROLL + for (int q = 0; q < ThreadOutputShape::kW; ++q) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Iterations::kColumn; ++n) { + const int h = p * StrideShape::kRow + iterator_r_ * DilationShape::kRow; + const int w = q * StrideShape::kColumn + iterator_s_ * DilationShape::kColumn; + + dst_ptr[n + q + p * ThreadOutputShape::kW] = activation[h][w][n]; + } + } + } + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { load_with_pointer_offset(frag, 0); } + + /// Stores a fragment to memory at the location pointed to by the iterator + CUTLASS_HOST_DEVICE + void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) const { + // Do nothing at present. 
+ } + + /// Stores a fragment to memory at the location pointed to by the iterator + CUTLASS_HOST_DEVICE + void store(Fragment const &frag, Index pointer_offset) const { + store_with_pointer_offset(frag, 0); + } + + CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + // no operation here + } +}; + +} // namespace warp +} // namespace conv +} // namespace cutlass diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/warp/scale_bias_relu_transform.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/warp/scale_bias_relu_transform.h new file mode 100644 index 0000000000000000000000000000000000000000..6cb3935a7e070f0dc34b1ec9c31d9ac448d43b8b --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/conv/warp/scale_bias_relu_transform.h @@ -0,0 +1,221 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing warp-level per channel scale+bias+relu before + matrix multiply-accumulate operations targeting Tensor Cores. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/platform/platform.h" + +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/arch/mma_sm75.h" +#include "cutlass/arch/mma_sm80.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/warp/mma.h" + +#include "cutlass/gemm/warp/mma_tensor_op_policy.h" + +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace conv { +namespace warp { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct FpropScaleBiasReluTransform { + + using T = typename FragmentActivations::Element; + + static int const NumActivations = FragmentActivations::kElements; + static int const NumScaleBias = FragmentScaleBias::kElements; + static int const MmaElements = 2; + // One element has one scale and one bias + static int const MmaScaleBiasPair = 2; + // 16816 has 2 columns + static int const MmaCols = 2; + + using MmaOperand = Array; + using ScaleBiasOperand = Array; + + CUTLASS_DEVICE + void transform(MmaOperand &activations, ScaleBiasOperand const &scale_bias) { + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + uint32_t *ptr_activations = reinterpret_cast(&activations); + uint32_t const *ptr_scale_bias = reinterpret_cast(&scale_bias); + + // Apply per channel scale+bias+relu if the data is not a special NaN + // (0x7eff). If it is a special NaN (0x7eff), hard code the output to 0. + + // We assumes the pair of FP16 are either both inbound or both out-of-bound. + // It requires C to be an even number. 
+ asm volatile( + "{\n\t" + " .reg .pred %%p;\n\t" + " .reg .b32 t1;\n\t" + " setp.eq.u32 %%p, %2, %4;\n\t" + " fma.rn.f16x2.relu t1, %1, %2, %3;\n" + " selp.u32 %0, 0, t1, %%p;\n\t" + "}\n" + : "=r"(ptr_activations[0]) + : "r"(ptr_scale_bias[0]), "r"(ptr_activations[0]), + "r"(ptr_scale_bias[1]), "n"(cutlass::arch::OOB_NAN_F16x2)); +#else + assert(0); +#endif + } + + CUTLASS_DEVICE + void operator()(FragmentActivations &activations, + FragmentScaleBias const &scale_bias) { + MmaOperand *ptr_activations = reinterpret_cast(&activations); + ScaleBiasOperand const *ptr_scale_bias = + reinterpret_cast(&scale_bias); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < (NumActivations / MmaElements); ++i) { + transform(ptr_activations[i], ptr_scale_bias[(i / MmaScaleBiasPair) % MmaCols]); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct WgradScaleBiasReluTransform { + + using T = typename FragmentActivations::Element; + + static int const NumActivations = FragmentActivations::kElements; + static int const NumScaleBias = FragmentScaleBias::kElements; + static int const MmaElements = 2; + // One element has one scale and one bias + static int const MmaScaleBiasPair = 2; + // 16816 has 2 rows + static int const MmaRows = 2; + + using MmaOperand = Array; + using ScaleBiasOperand = Array<__half2, MmaScaleBiasPair>; + + CUTLASS_DEVICE + void transform(MmaOperand &activations, ScaleBiasOperand const &scale_bias) { + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + + __half2 *ptr_activations = reinterpret_cast<__half2 *>(&activations); + uint32_t const *ptr_scale_bias = reinterpret_cast(&scale_bias); + +#if 1 + // CUDA + PTX version + + bool h1_oob = (reinterpret_cast(ptr_activations[0].x) == cutlass::arch::OOB_NAN_F16); + bool h2_oob = (reinterpret_cast(ptr_activations[0].y) == cutlass::arch::OOB_NAN_F16); + + // Apply per channel scale+bias+relu if the data is not a special NaN + // 
(0x7eff). If it is a special NaN (0x7eff), hard code the output to 0. + + // We cannot gurantee that the pair of F16 are both in bound or both + // out-of-bound because C x R x S can be an odd number. + asm volatile( + "{\n\t" + " fma.rn.f16x2.relu %0, %1, %2, %3;\n" + "}" + : "=r"(reinterpret_cast(ptr_activations[0])) + : "r"(ptr_scale_bias[0]), "r"(reinterpret_cast(ptr_activations[0])), + "r"(ptr_scale_bias[1])); + + reinterpret_cast(ptr_activations[0]) = h1_oob ? + (reinterpret_cast(ptr_activations[0]) & 0xffff0000) : + reinterpret_cast(ptr_activations[0]); + + reinterpret_cast(ptr_activations[0]) = h2_oob ? + (reinterpret_cast(ptr_activations[0]) & 0xffff) : + reinterpret_cast(ptr_activations[0]); +#else + // pure PTX version + + // Apply per channel scale+bias+relu if the data is not a special NaN + // (0x7eff). If it is a special NaN (0x7eff), hard code the output to 0. + asm volatile( + "{\n" + " .reg .b16 t1, t2;\n" + " .reg .b32 t3, t4, t5, t6;\n" + " .reg .pred p1, p2;\n" + " mov.b32 {t1, t2}, %2;\n" + " setp.eq.s16 p1, t1, %4;\n" + " setp.eq.s16 p2, t2, %4;\n" + " fma.rn.f16x2.relu t3, %1, %2, %3;\n" + " and.b32 t4, t3, %5;\n" + " selp.b32 t5, t4, t3, p1;\n" + " and.b32 t6, t5, %6;\n" + " selp.b32 %0, t6, t5, p2;\n" + "}\n" + : "=r"(reinterpret_cast(ptr_activations[0])) + : "r"(ptr_scale_bias[0]), "r"(reinterpret_cast(ptr_activations[0])), + "r"(ptr_scale_bias[1]), "n"(cutlass::arch::OOB_NAN_F16), "n"(0xffff0000), "n"(0x0000ffff)); +#endif +#else + assert(0); +#endif + } + + CUTLASS_DEVICE + void operator()(FragmentActivations &activations, + FragmentScaleBias const &scale_bias) { + MmaOperand *ptr_activations = reinterpret_cast(&activations); + ScaleBiasOperand const *ptr_scale_bias = + reinterpret_cast(&scale_bias); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < (NumActivations / MmaElements); ++i) { + transform(ptr_activations[i], ptr_scale_bias[(i / MmaRows)]); + } + } +}; +} // namespace warp +} // namespace conv +} // namespace cutlass + 
+///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/coord.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/coord.h new file mode 100644 index 0000000000000000000000000000000000000000..16cfa1b322f24f3e1c64f14b91dd880798e3b68d --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/coord.h @@ -0,0 +1,478 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief A Coord is a coordinate of arbitrary rank into a tensor or matrix +*/ + +#pragma once +#include "cutlass/cutlass.h" +#if defined(__CUDACC_RTC__) +#include CUDA_STD_HEADER(cstdint) +#else +#include +#endif + +namespace cutlass { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Statically-sized array specifying Coords within a tensor +template < + int Rank_, ///< Logical rank of coordinate + typename Index_ = int, ///< Index type used for each dimension + typename LongIndex_ = int64_t ///< Long index type used for linear offsets +> +struct Coord { + +public: + + // + // Type and constant definitions + // + + /// Number of elements in Coord + static int const kRank = Rank_; + + /// Index type used to store elements + using Index = Index_; + + /// Type used to represent linear offsets + using LongIndex = LongIndex_; + +private: + + // + // Data members + // + + /// Indices + Index idx[kRank]; + +public: + + // + // Methods + // + + /// Default ctor initializes uniformly + CUTLASS_HOST_DEVICE + explicit Coord(Index value = Index(0)) { + for (int i = 0; i < kRank; ++i) { + idx[i] = value; + } + } + + /// Constructs from an array of integers + CUTLASS_HOST_DEVICE + Coord(Index const (&_idx)[kRank]) { + for (int i = 0; i < kRank; ++i) { + idx[i] = _idx[i]; + } 
+ } + + /// Constructs from some other Coord + template + CUTLASS_HOST_DEVICE + Coord(Coord other) { + for (int i = 0; i < kRank; ++i) { + idx[i] = other[i]; + } + } + + /// Returns a slice of the Coord which may be larger or smaller in rank + /// than this. + template + CUTLASS_HOST_DEVICE + Coord slice(int start = 0, Index identity = 0) const { + Coord result; + for (int i = 0; i < Slice; ++i) { + if (i + start < kRank) { + result[i] = idx[i + start]; + } + else { + result[i] = identity; + } + } + return result; + } + + /// Returns the index of the dimension with least value + CUTLASS_HOST_DEVICE + int min_dim_index() const { + int i = 0; + for (int j = 1; j < kRank; ++j) { + if (idx[j] < idx[i]) { + i = j; + } + } + return i; + } + + /// Returns the index of the dimension with greatest value + CUTLASS_HOST_DEVICE + int max_dim_index() const { + int i = 0; + for (int j = 1; j < kRank; ++j) { + if (idx[j] > idx[i]) { + i = j; + } + } + return i; + } + + /// Returns true if Coord is non-zero. + CUTLASS_HOST_DEVICE + explicit operator bool() const { + for (int i = 0; i < kRank; ++i) { + if (idx[i]) { + return true; + } + } + return false; + } + + /// Returns true if Coord is uniformly zero. 
+ CUTLASS_HOST_DEVICE + bool operator!() const { + for (int i = 0; i < kRank; ++i) { + if (idx[i]) { + return false; + } + } + return true; + } + + /// Element-wise addition + CUTLASS_HOST_DEVICE + Coord operator+(Coord const& b) const { + Coord c; + for (int i = 0; i < kRank; ++i) { + c.idx[i] = idx[i] + b.idx[i]; + } + return c; + } + + /// Element-wise subtraction + CUTLASS_HOST_DEVICE + Coord operator-(Coord const& b) const { + Coord c; + for (int i = 0; i < kRank; ++i) { + c.idx[i] = idx[i] - b.idx[i]; + } + return c; + } + + /// Element-wise multiplication + CUTLASS_HOST_DEVICE + Coord operator*(Coord const& b) const { + Coord c; + for (int i = 0; i < kRank; ++i) { + c.idx[i] = idx[i] * b.idx[i]; + } + return c; + } + + /// Element-wise division + CUTLASS_HOST_DEVICE + Coord operator/(Coord const& b) const { + Coord c; + for (int i = 0; i < kRank; ++i) { + c.idx[i] = idx[i] / b.idx[i]; + } + return c; + } + + /// In-place addition + CUTLASS_HOST_DEVICE + Coord& operator+=(Coord const& b) { + for (int i = 0; i < kRank; ++i) { + idx[i] += b.idx[i]; + } + return *this; + } + + /// In-place subtraction + CUTLASS_HOST_DEVICE + Coord& operator-=(Coord const& b) { + for (int i = 0; i < kRank; ++i) { + idx[i] -= b.idx[i]; + } + return *this; + } + + /// In-place multiplication + CUTLASS_HOST_DEVICE + Coord& operator*=(Coord const& b) { + for (int i = 0; i < kRank; ++i) { + idx[i] *= b.idx[i]; + } + return *this; + } + + /// In-place division + CUTLASS_HOST_DEVICE + Coord& operator/=(Coord const& b) { + for (int i = 0; i < kRank; ++i) { + idx[i] /= b.idx[i]; + } + return *this; + } + + /// Member access operator + CUTLASS_HOST_DEVICE Index& operator[](int dim) { return idx[dim]; } + + /// Member access operator + CUTLASS_HOST_DEVICE Index const& operator[](int dim) const { return idx[dim]; } + + /// Computes the dot product with anotherCoord object + CUTLASS_HOST_DEVICE + LongIndex dot(Coord const& b, LongIndex sum = LongIndex(0)) const { + for (int i = 0; i < kRank; 
++i) { + sum += idx[i] * b.idx[i]; + } + return sum; + } + + /// Gets the index of a given Coord element + template + CUTLASS_HOST_DEVICE Index& at() { + return idx[Dim]; + } + + /// Access via index; may limit unrolling potential + CUTLASS_HOST_DEVICE + Index& at(int dim) { return idx[dim]; } + + /// Gets the index of a given Coord element + template + CUTLASS_HOST_DEVICE Index const& at() const { + return idx[Dim]; + } + + /// Access via index; may limit unrolling potential + CUTLASS_HOST_DEVICE + Index const& at(int dim) const { return idx[dim]; } + + /// Determines if two Coord<> objects are equal + CUTLASS_HOST_DEVICE + bool operator==(Coord const& b) const { + bool equal = true; + for (int i = 0; equal && i < kRank; ++i) { + equal = (idx[i] == b.idx[i]); + } + return equal; + } + + /// Not equal + CUTLASS_HOST_DEVICE + bool operator!=(Coord const& b) const { return !(*this == b); } + + /// Clamps a coordinate to a range specified by maximum and minimum values + CUTLASS_HOST_DEVICE + Coord& clamp(Coord const& max, Coord const& min = Coord()) { + for (int i = 0; i < kRank; ++i) { + idx[i] = __NV_STD_MAX(__NV_STD_MIN(idx[i], max.idx[i]), min.idx[i]); + } + return *this; + } + + /// Returns the sum of all elements + CUTLASS_HOST_DEVICE + Index sum() const { + Index sum_(idx[0]); + for (int i = 1; i < kRank; ++i) { + sum_ += idx[i]; + } + return sum_; + } + + /// Returns the product of all elements + CUTLASS_HOST_DEVICE + LongIndex product() const { + LongIndex product_(idx[0]); + for (int i = 1; i < kRank; ++i) { + product_ *= idx[i]; + } + return product_; + } + + /// Less than operator + CUTLASS_HOST_DEVICE + bool operator<(Coord const &b) const { + for (int i = 0; i < kRank; ++i) { + if (!(idx[i] < b[i])) { + return false; + } + } + return true; + } + + /// Less than or equals operator + CUTLASS_HOST_DEVICE + bool operator<=(Coord const &b) const { + for (int i = 0; i < kRank; ++i) { + if (!(idx[i] <= b[i])) { + return false; + } + } + return true; + } + + /// 
Greater than operator + CUTLASS_HOST_DEVICE + bool operator>(Coord const &b) const { + return !(*this <= b); + } + + /// Greater than or equals operator + CUTLASS_HOST_DEVICE + bool operator>=(Coord const &b) const { + return !(*this < b); + } +}; + +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + + +/// Scalar multiplication +template +CUTLASS_HOST_DEVICE +Coord operator*(Index s, Coord coord) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Rank; ++i) { + coord[i] *= s; + } + return coord; +} + +/// Scalar multiplication +template +CUTLASS_HOST_DEVICE +Coord operator*(Coord coord, Index s) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Rank; ++i) { + coord[i] *= s; + } + return coord; +} + +/// Scalar division +template +CUTLASS_HOST_DEVICE +Coord operator/(Index s, Coord coord) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Rank; ++i) { + coord[i] = s / coord[i]; + } + return coord; +} + +/// Scalar division +template +CUTLASS_HOST_DEVICE +Coord operator/(Coord coord, Index s) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Rank; ++i) { + coord[i] /= s; + } + return coord; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Integer-valued make_Coord +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to make a 1-element coordinate +template +CUTLASS_HOST_DEVICE +Coord<1, T> make_Coord(T _0) { + T values[1] = {_0}; + return Coord<1, T>(values); +} + +/// Helper to make a 2-element coordinate +template +CUTLASS_HOST_DEVICE +Coord<2, T> make_Coord(T _0, T _1) { + T values[2] = {_0, _1}; + return Coord<2, T>(values); +} + +/// Helper to make a 3-element coordinate +template +CUTLASS_HOST_DEVICE +Coord<3, T> make_Coord(T _0, T _1, T _2) { + T values[3] = {_0, _1, _2}; + return Coord<3, T>(values); +} + +/// Helper to make a 4-element 
coordinate +template +CUTLASS_HOST_DEVICE +Coord<4, T> make_Coord(T _0, T _1, T _2, T _3) { + T values[4] = {_0, _1, _2, _3}; + return Coord<4, T>(values); +} + +/// Helper to make a 5-element coordinate +template +CUTLASS_HOST_DEVICE +Coord<5, T> make_Coord(T _0, T _1, T _2, T _3, T _4) { + T values[5] = {_0, _1, _2, _3, _4}; + return Coord<5, T>(values); +} + +/// Helper to make a 1-element coordinate +template +CUTLASS_HOST_DEVICE +Coordmake_Coord_with_padding(T _0) { + Coord coord; + + CUTLASS_PRAGMA_UNROLL + for (int i = N - 1; i > 0; --i) { + coord[i] = 0; + } + + coord[0] = _0; + + return coord; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/core_io.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/core_io.h new file mode 100644 index 0000000000000000000000000000000000000000..046b3063a8ca7e1b79248ddea8d10af239eb4bdb --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/core_io.h @@ -0,0 +1,328 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Helpers for printing cutlass/core objects +*/ +#pragma once + +#include +#include + +#include "cutlass/array.h" +#include "cutlass/coord.h" +#include "cutlass/numeric_types.h" +#include "cutlass/matrix.h" +#include "cutlass/quaternion.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/tensor_view.h" +#include "cutlass/gemm/gemm_enumerated_types.h" +#include "cutlass/conv/convolution.h" +#include "cutlass/conv/conv2d_problem_size.h" +#include "cutlass/conv/conv3d_problem_size.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Output operator for CUDA built-in dim3 type +inline std::ostream &operator<<(std::ostream &out, dim3 d) { + return out << d.x << ", " << d.y << ", " << d.z; +} + +/// Output operator for CUDA built-in error type +inline std::ostream &operator<<(std::ostream &out, cudaError_t error) { + return out << cudaGetErrorString(error); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// stream operators for cutlass namespace // +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline +std::ostream& operator<<(std::ostream& out, Array const& v) { + for (int i = 0; i < Rank; ++i) { + out << (i ? ", " : "") << v[i]; + } + return out; +} + +template +inline +std::ostream& operator<<(std::ostream& out, Coord const& coord) { + for (int i = 0; i < Rank; ++i) { + out << (i ? 
", " : "") << coord[i]; + } + return out; +} + +inline +std::istream & operator>>(std::istream &stream, half_t &x) { + float tmp; + stream >> tmp; + x = static_cast(tmp); + return stream; +} + +inline +std::ostream & operator<<(std::ostream &out, half_t const &x) { + return out << float(x); +} + +inline +std::ostream & operator<<(std::ostream &out, bfloat16_t const &x) { + return out << float(x); +} + +inline +std::ostream & operator<<(std::ostream &out, tfloat32_t const &x) { + return out << float(x); +} + + +inline +std::ostream & operator<<(std::ostream &out, float_e2m1_t const &x) { + return out << float(x); +} + +inline +std::ostream & operator<<(std::ostream &out, detail::float_e2m1_unpacksmem_t const &x) { + return out << float(x); +} + +inline +std::ostream & operator<<(std::ostream &out, float_e3m2_t const &x) { + return out << float(x); +} + +inline +std::ostream & operator<<(std::ostream &out, float_e2m3_t const &x) { + return out << float(x); +} + +inline +std::ostream & operator<<(std::ostream &out, detail::float_e3m2_unpacksmem_t const &x) { + return out << float(x); +} + +inline +std::ostream & operator<<(std::ostream &out, detail::float_e2m3_unpacksmem_t const &x) { + return out << float(x); +} + +inline +std::ostream & operator<<(std::ostream &out, float_ue8m0_t const &x) { + return out << float(x); +} + +inline +std::ostream & operator<<(std::ostream &out, float_ue4m3_t const &x) { + return out << float(x); +} + + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper to enable formatted printing of CUTLASS scalar types to an ostream +template +struct ScalarIO { + + /// Value to print + T value; + + /// Default ctor + ScalarIO() { } + + /// Constructs from a value + ScalarIO(T value): value(value) {} +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Default printing to ostream +template +inline std::ostream 
&operator<<(std::ostream &out, ScalarIO const &scalar) { + return out << scalar.value; +} + +/// Printing to ostream of int8_t as integer rather than character +template <> +inline std::ostream &operator<<(std::ostream &out, ScalarIO const &scalar) { + return out << int(scalar.value); +} + +/// Printing to ostream of uint8_t as integer rather than character +template <> +inline std::ostream &operator<<(std::ostream &out, ScalarIO const &scalar) { + return out << unsigned(scalar.value); +} + + +/// Default printing to ostream for MatrixShape +template +inline +std::ostream & operator<<(std::ostream &out, MatrixShape const &matrix_shape) { + out << "cutlass::MatrixShape::(kRow, kColumn) {" + << cutlass::MatrixShape::kRow <<"," + << cutlass::MatrixShape::kColumn <<"}"; + return out; +} + + +/// Prints matrix to ostream +template +std::ostream & operator<<(std::ostream &out, Matrix const &rhs) { + + for (int i = 0; i < Rows; ++i) { + for (int j = 0; j < Columns; ++j) { + ScalarIO element(rhs.at(i, j)); + out << (j ? 
", " : "") << element; + } + out << "\\n"; + } + + return out; +} + +template +std::ostream &operator<<(std::ostream &out, Quaternion const &rhs) { + + out << ScalarIO(rhs.w()) << " "; + if (rhs.x() >= 0) { + out << "+"; + } + + out << ScalarIO(rhs.x()) << "*i "; + if (rhs.y() >= 0) { + out << "+"; + } + + out << ScalarIO(rhs.y()) << "*j "; + if (rhs.z() >= 0) { + out << "+"; + } + + out << ScalarIO(rhs.z()) << "*k"; + + return out; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// stream operators for cutlass::gemm namespace // +/////////////////////////////////////////////////////////////////////////////////////////////////// +namespace gemm { + +/// Default printing to ostream for GemmShape +template +inline +std::ostream & operator<<(std::ostream &out, GemmShape const &gemm_shape) { + out << "cutlass::gemm::GemmShape::(kM, kN, kK) {" + << cutlass::gemm::GemmShape::kM <<"," + << cutlass::gemm::GemmShape::kN <<"," + << cutlass::gemm::GemmShape::kK << "}"; + return out; +} + +/// Default printing to ostream for GemmCoord +inline +std::ostream & operator<<(std::ostream &out, GemmCoord const &gemm_coord) { + out << "cutlass::gemm::GemmCoord {" + << gemm_coord.m() <<"," + << gemm_coord.n() <<"," + << gemm_coord.k() << "}"; + return out; +} + +} //namespace gemm +/////////////////////////////////////////////////////////////////////////////////////////////////// + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// stream operators for cutlass namespace // +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Default printing to ostream for PitchLinearShape +template < int Contiguous, int Strided> +inline +std::ostream & operator<<(std::ostream &out, PitchLinearShape const &pitch_linear_shape) { + out << "cutlass::PitchLinearShape:(kContiguous, kStrided) {" + << 
cutlass::layout::PitchLinearShape::kContiguous <<"," + << cutlass::layout::PitchLinearShape::kStrided <<"}"; + return out; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// stream operators for cutlass::conv namespace // +/////////////////////////////////////////////////////////////////////////////////////////////////// +namespace conv { +/// Default printing to ostream for Conv2dProblemSize +inline +std::ostream& operator<<(std::ostream& out, Conv2dProblemSize const& problem) { + out << "NHWC: (" << problem.N << ", " << problem.H << ", " << problem.W << ", " << problem.C << ")" << std::endl + << "KRSC: (" << problem.K << ", " << problem.R << ", " << problem.S << ", " << problem.C / problem.groups << ")" << std::endl + << "NPQK: (" << problem.N << ", " << problem.P << ", " << problem.Q << ", " << problem.K << ")" << std::endl + << "groups: (" << problem.groups << ")" << std::endl + << "Pad_h, Pad_w: (" << problem.pad_h << ", " << problem.pad_w << ")" << std::endl + << "Stride_h, Stride_w: (" << problem.stride_h << ", " << problem.stride_w << ")" << std::endl + << "Dilation_h, Dilation_w: (" << problem.dilation_h << ", " << problem.dilation_w << ")" << std::endl + << "split_k_slices: (" << problem.split_k_slices << ")" << std::endl + << "mode: (" << ((problem.mode==conv::Mode::kConvolution) ? 
"conv" : "xcross") << ")"; + + return out; +} + + +/// Default printing to ostream for Conv3dProblemSize +inline +std::ostream& operator<<(std::ostream& out, Conv3dProblemSize const& problem) { + out << "NDHWC: (" << problem.N << ", " << problem.D << ", " << problem.H << ", " << problem.W << ", " << problem.C << ")" << std::endl + << "KTRSC: (" << problem.K << ", " << problem.T << ", " << problem.R << ", " << problem.S << ", " << problem.C << ")" << std::endl + << "NZPQK: (" << problem.N << ", " << problem.Z << ", " << problem.P << ", " << problem.Q << ", " << problem.K << ")" << std::endl + << "pad_d, pad_h, pad_w: (" << problem.pad_d << ", " << problem.pad_h << ", " << problem.pad_w << ")" << std::endl + << "stride_d, stride_h, stride_w: (" << problem.stride_d << ", " << problem.stride_h << ", " << problem.stride_w << ")" << std::endl + << "dilation_d, dilation_h, dilation_w: (" << problem.dilation_d << ", " << problem.dilation_h << ", " << problem.dilation_w << ")" << std::endl + << "split_k_slices: (" << problem.split_k_slices << ") " << std::endl + << "mode: (" << ((problem.mode==conv::Mode::kConvolution) ? 
"conv" : "xcross") << ")"; + + return out; +} + +} // namespace conv +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/cuda_host_adapter.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/cuda_host_adapter.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a8af62be2d3e27ccf499acaead03dc3aadd4c151 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/cuda_host_adapter.hpp @@ -0,0 +1,428 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Interface between a CUTLASS device-wide operator and CUDA. +*/ + +#pragma once + +#include +#include "cutlass/cutlass.h" +#include "cutlass/trace.h" + +#include "cutlass/platform/platform.h" +#if ! defined(__CUDACC_RTC__) +#include +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// NVRTC doesn't need definitions for these host classes + +#if ((__CUDACC_VER_MAJOR__ >= 12) || \ + ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8))) \ + && !defined(__CUDACC_RTC__) +#define CUDA_HOST_ADAPTER_LAUNCH_ATTRIBUTES_ENABLED +#endif + +#if ((__CUDACC_VER_MAJOR__ >= 12) && !defined(__CUDACC_RTC__)) +#define CUDA_HOST_ADAPTER_TENSORMAP_ENABLED +#endif + +// Include for CUDA Driver API calls if any of these capabilities are enabled. 
+#if defined(CUDA_HOST_ADAPTER_LAUNCH_ATTRIBUTES_ENABLED) || \ + defined(CUDA_HOST_ADAPTER_TENSORMAP_ENABLED) + +#include + +#endif // defined(CUDA_HOST_ADAPTER_LAUNCH_ATTRIBUTES_ENABLED) || + // defined(CUDA_HOST_ADAPTER_TENSORMAP_ENABLED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// +// Macro-level guard for CUDA Host Adapter +// +#if !defined(CUTLASS_ENABLE_CUDA_HOST_ADAPTER) +#define CUTLASS_ENABLE_CUDA_HOST_ADAPTER false +#endif + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +///////////////////////////////////////////////////////////////////////////////////////////////// + + +#if !defined(__CUDACC_RTC__) + +#if ((__CUDACC_VER_MAJOR__ >= 12) || \ + ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 8))) +#include +#endif // (__CUDACC_VERSION__ >= 11.8) + +#include + +#define CUTLASS_CUDA_DRIVER_STRINGIFY(tok) #tok + +#if defined(CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL) + +#define CUTLASS_CUDA_DRIVER_WRAPPER_DECL(func, ver) \ + template \ + CUresult call_##func(Args... args) { \ + return func(args...); \ + } + +#else // defined(CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL) + +#if (__CUDACC_VER_MAJOR__ > 12) + +#define CUTLASS_CUDA_DRIVER_WRAPPER_DECL(func, ver) \ + template \ + CUresult call_##func(Args... args) { \ + cudaDriverEntryPointQueryResult cuda_status; \ + void* pfn = nullptr; \ + cudaError_t cuda_err = cudaGetDriverEntryPointByVersion( \ + CUTLASS_CUDA_DRIVER_STRINGIFY(func), \ + &pfn, ver, \ + cudaEnableDefault, \ + &cuda_status); \ + if (cuda_status != cudaDriverEntryPointSuccess || \ + cuda_err != cudaSuccess) { \ + return CUDA_ERROR_UNKNOWN; \ + } \ + return reinterpret_cast(pfn)(args...); \ + } + +#else + +#define CUTLASS_CUDA_DRIVER_WRAPPER_DECL(func, ver) \ + template \ + CUresult call_##func(Args... 
args) { \ + cudaDriverEntryPointQueryResult cuda_status; \ + void* pfn = nullptr; \ + cudaError_t cuda_err = cudaGetDriverEntryPoint( \ + CUTLASS_CUDA_DRIVER_STRINGIFY(func), \ + &pfn, \ + cudaEnableDefault, \ + &cuda_status); \ + if (cuda_status != cudaDriverEntryPointSuccess || \ + cuda_err != cudaSuccess) { \ + return CUDA_ERROR_UNKNOWN; \ + } \ + return reinterpret_cast(pfn)(args...); \ + } + +#endif // (__CUDACC_VER_MAJOR__ > 12) + +#endif // defined(CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL) + +#if (__CUDACC_VER_MAJOR__ >= 12) +CUTLASS_CUDA_DRIVER_WRAPPER_DECL(cuTensorMapEncodeTiled, 12000); +CUTLASS_CUDA_DRIVER_WRAPPER_DECL(cuTensorMapEncodeIm2col, 12000); +#endif + +#undef CUTLASS_CUDA_DRIVER_STRINGIFY + +#define CUTLASS_CUDA_DRIVER_WRAPPER_CALL(func) cutlass::call_##func + +#endif // !defined(__CUDACC_RTC__) + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// This class manages runtime CUlaunchAttribute that can be supplied to CudaHostAdapter +/// CudaHostLaunchAttributes will be an empty struct in earlier CTK where CUlaunchAttribute +/// is not introduced. 
+struct CudaHostLaunchAttributes { + +#if defined(CUDA_HOST_ADAPTER_LAUNCH_ATTRIBUTES_ENABLED) + + /// Reasonable maximum launch attributes that are commonly applied + static constexpr int32_t kMaximumAttributeCount = 5; + + /// Launch attributes + CUlaunchAttribute launch_attributes[kMaximumAttributeCount]; + int32_t attribute_count = 0; + + CUTLASS_HOST_DEVICE + CudaHostLaunchAttributes(CUlaunchAttribute *launch_attributes_ = nullptr, + int32_t attribute_count_ = 0) { + CUTLASS_ASSERT(attribute_count_ >= 0 && attribute_count_ < kMaximumAttributeCount); + for (int32_t i = 0; i < attribute_count_ && i < kMaximumAttributeCount; ++i) { + launch_attributes[i] = launch_attributes_[i]; + } + attribute_count = attribute_count_; + } + + CUTLASS_HOST_DEVICE + CUlaunchAttribute const* data() const { + return launch_attributes; + } + + CUTLASS_HOST_DEVICE + size_t size() const { + return attribute_count; + } + +#endif // (CUDA_HOST_ADAPTER_LAUNCH_ATTRIBUTES_ENABLED) + +}; + + +/// This class defines an object which abstracts interactions between the CUTLASS device-wide GEMM and +/// CUDA. The intention is to enable CUTLASS to be used with both the CUDA Runtime API and CUDA Driver API. 
+struct CudaHostAdapter { + + /// Limit the number of kernels + static constexpr int32_t kMaximumKernelCount = 4; + + /// Maximum cluster size + static constexpr int MaxClusterSize = 32; + + // + // Data members + // + + /// Handles + void *kernel_handles[kMaximumKernelCount]; + int32_t kernel_count = 0; + + CudaHostLaunchAttributes launch_attributes; + + // + // Methods + // + + /// Ctor + CudaHostAdapter() = default; + + /// Dtor + virtual ~CudaHostAdapter() = default; + + /// Copy Ctor + CUTLASS_HOST_DEVICE + CudaHostAdapter(const CudaHostAdapter & rhs) + : kernel_count(rhs.kernel_count), + launch_attributes(rhs.launch_attributes) { + CUTLASS_ASSERT(rhs.kernel_count >= 0 && rhs.kernel_count < kMaximumKernelCount); + + for (int32_t i = 0; i < rhs.kernel_count && i < kMaximumKernelCount; ++i) { + kernel_handles[i] = rhs.kernel_handles[i]; + } + } + + /// Copy Assignment + CUTLASS_HOST_DEVICE + CudaHostAdapter& operator=(const CudaHostAdapter & rhs) { + CUTLASS_ASSERT(rhs.kernel_count >= 0 && rhs.kernel_count < kMaximumKernelCount); + for (int32_t i = 0; i < rhs.kernel_count && i < kMaximumKernelCount; ++i) { + kernel_handles[i] = rhs.kernel_handles[i]; + } + kernel_count = rhs.kernel_count; + + launch_attributes = rhs.launch_attributes; + + return *this; + } + + + /// Move ctor + CUTLASS_HOST_DEVICE + CudaHostAdapter(CudaHostAdapter && rhs) + : kernel_count(rhs.kernel_count), + launch_attributes(std::move(rhs.launch_attributes)) { + CUTLASS_ASSERT(rhs.kernel_count >= 0 && rhs.kernel_count < kMaximumKernelCount); + + for (int32_t i = 0; i < rhs.kernel_count && i < kMaximumKernelCount; ++i) { + kernel_handles[i] = rhs.kernel_handles[i]; + } + } + + // / Move assignment + CUTLASS_HOST_DEVICE + CudaHostAdapter& operator=(CudaHostAdapter && rhs) { + CUTLASS_ASSERT(rhs.kernel_count >= 0 && rhs.kernel_count < kMaximumKernelCount); + for (int32_t i = 0; i < rhs.kernel_count && i < kMaximumKernelCount; ++i) { + kernel_handles[i] = rhs.kernel_handles[i]; + } + kernel_count 
= rhs.kernel_count; + launch_attributes = std::move(rhs.launch_attributes); + return *this; + } + + /// Ctor + CUTLASS_HOST_DEVICE + CudaHostAdapter(void **kernel_handles_, + int32_t kernel_count_, + CudaHostLaunchAttributes const &launch_attributes_ = { }) + : kernel_count(kernel_count_), + launch_attributes(launch_attributes_) { + CUTLASS_ASSERT(kernel_count >= 0 && kernel_count < kMaximumKernelCount); + + for (int32_t i = 0; i < kernel_count && i < kMaximumKernelCount; ++i) { + kernel_handles[i] = kernel_handles_[i]; + } + } + + /// Returns true if the CudaHostAdapter is empty (kernel_count == 0) + CUTLASS_HOST_DEVICE + bool empty() const { return !kernel_count; } + + /// Returns kernel_count + CUTLASS_HOST_DEVICE + size_t size() const { return static_cast(kernel_count); } + + /// Queries the occupancy of a kernel + virtual Status query_occupancy( + int32_t *device_sms, + int32_t *sm_occupancy, + int32_t kernel_index, + int32_t thread_count, + int32_t smem_size) const = 0; + + /// Launches a kernel without using Threadblock Clusters. + virtual Status launch( + dim3 const grid_dims, + dim3 const block_dims, + size_t const smem_size, + cudaStream_t cuda_stream, + void** kernel_params, + int32_t kernel_index) const = 0; + + /// Launches a kernel using the CUDA Extensible Launch API and Threadblock Clusters. + virtual Status launch( + dim3 const grid_dims, + dim3 const cluster_dims, + dim3 const block_dims, + size_t const smem_size, + cudaStream_t cuda_stream, + void** kernel_params, + int32_t kernel_index) const = 0; + + + + /// Launches a kernel using the CUDA Extensible Launch API and Threadblock Clusters. + /// This API is for preferred cluster launch; a preferred and a fallback cluster shapes are + /// considered for launch respectively. 
+ virtual Status launch( + dim3 const grid_dims, + dim3 const cluster_dims, + dim3 const fallback_cluster_dims, + dim3 const block_dims, + size_t const smem_size, + cudaStream_t cuda_stream, + void** kernel_params, + int32_t kernel_index) const = 0; + + + +#if defined(CUDA_HOST_ADAPTER_TENSORMAP_ENABLED) + + /// Create a tensor map descriptor object representing im2col memory region. + virtual CUresult tensorMapEncodeIm2col ( + CUtensorMap* tensorMap, + CUtensorMapDataType tensorDataType, + cuuint32_t tensorRank, + void* globalAddress, + const cuuint64_t* globalDim, + const cuuint64_t* globalStrides, + const int* pixelBoxLowerCorner, + const int* pixelBoxUpperCorner, + cuuint32_t channelsPerPixel, + cuuint32_t pixelsPerColumn, + const cuuint32_t* elementStrides, + CUtensorMapInterleave interleave, + CUtensorMapSwizzle swizzle, + CUtensorMapL2promotion l2Promotion, + CUtensorMapFloatOOBfill oobFill) const = 0; + + /// Create a tensor map descriptor object representing tiled memory region. + virtual CUresult tensorMapEncodeTiled ( + CUtensorMap* tensorMap, + CUtensorMapDataType tensorDataType, + cuuint32_t tensorRank, + void* globalAddress, + const cuuint64_t* globalDim, + const cuuint64_t* globalStrides, + const cuuint32_t* boxDim, + const cuuint32_t* elementStrides, + CUtensorMapInterleave interleave, + CUtensorMapSwizzle swizzle, + CUtensorMapL2promotion l2Promotion, + CUtensorMapFloatOOBfill oobFill) const = 0; + + /// Modify an existing tensor map descriptor with an updated global address. + virtual CUresult tensorMapReplaceAddress( + CUtensorMap* tensorMap, + void* globalAddress) const = 0; + +#endif // defined(CUDA_HOST_ADAPTER_TENSORMAP_ENABLED) + +protected: + + /** + * Fills a buffer in Global Memory with a byte sequence copied from host memory. 
+ * This function can be overridden to dispatch to the appropriate cuMemsetD*Async API + */ + virtual Status memsetDeviceImpl( + void* destination, ///< Device memory pointer to be filled + void const* fill_value, ///< Value to be filled in the buffer + size_t fill_size, ///< Size of the data type to be used for filling the buffer + size_t count, ///< Number of elements of size fill_size + cudaStream_t stream) const = 0; + +public: + + /// Fills a buffer in Global Memory with a byte sequence copied from host memory + template + CUTLASS_HOST_DEVICE + Status memsetDevice( + void* destination, + FillValueType fill_value, + size_t count, + cudaStream_t stream) const { + return this->memsetDeviceImpl( + destination, + &fill_value, + sizeof(FillValueType), + count, + stream); + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/cutlass.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/cutlass.h new file mode 100644 index 0000000000000000000000000000000000000000..c68a3ba38cb554278e692d012ca2a93b547e08f1 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/cutlass.h @@ -0,0 +1,165 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Basic include for CUTLASS. +*/ + +#pragma once + +#include "cutlass/detail/helper_macros.hpp" + +#if (__CUDACC_VER_MAJOR__ >= 13) + #define CUDA_STD_HEADER(header) +#else + #define CUDA_STD_HEADER(header) +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +/// Status code returned by CUTLASS operations +enum class Status { + kSuccess, ///< Operation was successful. + kErrorMisalignedOperand, ///< operands fail alignment requirements. + kErrorInvalidDataType, ///< DataType fails requirement. + kErrorInvalidLayout, ///< Layout fails alignment requirement. 
+ kErrorInvalidProblem, ///< Specified problem size is not supported by operator. + kErrorNotSupported, ///< Operation is not supported on current device. + kErrorWorkspaceNull, ///< The given workspace is null when it is required to be non-null. + kErrorInternal, ///< An error within CUTLASS occurred. + kErrorArchMismatch, ///< CUTLASS runs on a device that it was not compiled for. + kErrorInsufficientDriver, ///< CUTLASS runs with a driver that is too old. + kErrorMemoryAllocation, ///< Kernel launch failed due to insufficient device memory. + kInvalid ///< Status is unspecified. +}; + +/// Convert cutlass status to status strings +CUTLASS_HOST_DEVICE +static char const* cutlassGetStatusString(cutlass::Status status) { + switch (status) { + case cutlass::Status::kSuccess: + return "Success"; + case cutlass::Status::kErrorMisalignedOperand: + return "Error Misaligned Operand"; + case cutlass::Status::kErrorInvalidDataType: + return "Error Invalid Data Type"; + case cutlass::Status::kErrorInvalidLayout: + return "Error Invalid Layout"; + case cutlass::Status::kErrorInvalidProblem: + return "Error Invalid Problem"; + case cutlass::Status::kErrorNotSupported: + return "Error Not Supported"; + case cutlass::Status::kErrorWorkspaceNull: + return "Error Workspace Null"; + case cutlass::Status::kErrorInternal: + return "Error Internal"; + case cutlass::Status::kErrorInsufficientDriver: + return "Error Insufficient Driver"; + case cutlass::Status::kErrorArchMismatch: + return "Error Architecture Mismatch"; + case cutlass::Status::kErrorMemoryAllocation: + return "Error Memory Allocation failed"; + case cutlass::Status::kInvalid: break; + } + + return "Invalid status"; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +static const int NumThreadsPerWarp = 32; +static const int NumThreadsPerWarpGroup = 128; +static const int NumWarpsPerWarpGroup = NumThreadsPerWarpGroup / NumThreadsPerWarp; +static const int 
NumThreadsPerHalfWarp = NumThreadsPerWarp / 2; +static const int NumThreadsPerQuad = 4; +static const int NumThreadsPerQuadPair = NumThreadsPerQuad * 2; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Helper function to return true when called by thread 0 of threadblock 0. +CUTLASS_HOST_DEVICE bool thread0() { + #if defined(__CUDA_ARCH__) + return (!threadIdx.x && !threadIdx.y && !threadIdx.z) && (!blockIdx.x && !blockIdx.y && !blockIdx.z); + #else + return false; + #endif +} + +/// Returns a lane index in the warp. The threads in warp may not be convergent +CUTLASS_DEVICE +int canonical_lane_idx() { + #if defined(__CUDA_ARCH__) + return threadIdx.x % NumThreadsPerWarp; + #else + return 0; + #endif +} + +/// Returns a warp-uniform value indicating the canonical warp index of the calling threads. +/// Threads within the warp must be converged. +CUTLASS_DEVICE +int canonical_warp_idx_sync() { + #if defined(__CUDA_ARCH__) + return __shfl_sync(0xffffffff, threadIdx.x / NumThreadsPerWarp, 0); + #else + return 0; + #endif +} + +/// Returns a warp index in the CTA. The threads in warp may not be convergent +/// As it doesn't sync the warp, it faster and allows forward progress +CUTLASS_DEVICE +int canonical_warp_idx() { + #if defined(__CUDA_ARCH__) + return threadIdx.x / NumThreadsPerWarp; + #else + return 0; + #endif +} + +/// Returns a warp-uniform value indicating the canonical warp group index of the calling threads. +/// Threads within the warp must be converged. 
+CUTLASS_DEVICE +int canonical_warp_group_idx() { + #if defined(__CUDA_ARCH__) + return __shfl_sync(0xffffffff, threadIdx.x / NumThreadsPerWarpGroup, 0); + #else + return 0; + #endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/blockwise_scale_layout.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/blockwise_scale_layout.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a304cd6e3adae2c009c5b474a8e2920b618a3ea3 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/blockwise_scale_layout.hpp @@ -0,0 +1,305 @@ +/*************************************************************************************************** + * Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + + + +/*! \file + \brief Blockwise Scale configs specific for Blockwise/Groupwise MMA +*/ + +#pragma once + +#include "cutlass/layout/matrix.h" + +#include "cute/int_tuple.hpp" +#include "cute/atom/mma_traits_sm100.hpp" +#include "cute/arch/mma_sm90.hpp" + +namespace cutlass::detail{ + +///////////////////////////////////////////////////////////////////////////////////////////////// +using namespace cute; + +template +struct Sm1xxBlockwiseScaleConfig { + + using ShapeSFA = Shape, int32_t>, Shape, int32_t>, int32_t>; + using ShapeSFB = Shape, int32_t>, Shape, int32_t>, int32_t>; + + using StrideSFA = conditional_t,Stride<_0,int32_t>, int32_t>, + Stride,Stride<_0,_1>, int32_t>>; + + using StrideSFB = conditional_t,Stride<_0,int32_t>, int32_t>, + Stride,Stride<_0,_1>, int32_t>>; + + using LayoutSFA = Layout; + using LayoutSFB = Layout; + + CUTE_HOST_DEVICE + static constexpr auto + deduce_layoutSFA() { + return LayoutSFA{}; + } + + template + CUTE_HOST_DEVICE + static constexpr auto + smem_atom_layoutSFA(CtaShape_MNK cta_shape_mnk) { + static_assert(cute::is_static_v, "Expect 
static CTA shape"); + auto strides = [&]() CUTLASS_LAMBDA_FUNC_INLINE { + auto [M, N, K] = cta_shape_mnk; + if constexpr (majorSFA == UMMA::Major::MN) { + return make_stride(make_stride(_0{}, _1{}), make_stride(_0{}, Int(CtaShape_MNK{}), SFVecSizeM)>{})); + } + else { + return make_stride(make_stride(_0{}, Int(CtaShape_MNK{}), SFVecSizeK)>{}), make_stride(_0{}, _1{})); + } + }(); + + auto [M, N, K] = cta_shape_mnk; + return make_layout( + make_shape(make_shape(Int{}, Int(CtaShape_MNK{}), SFVecSizeM)>{}), + make_shape(Int{}, Int(CtaShape_MNK{}), SFVecSizeK)>{})), + strides + ); + } + + + CUTE_HOST_DEVICE + static constexpr auto + deduce_layoutSFB() { + return LayoutSFB{}; + } + + template + CUTE_HOST_DEVICE + static constexpr auto + smem_atom_layoutSFB(CtaShape_MNK cta_shape_mnk) { + static_assert(cute::is_static_v, "Expect static CTA shape"); + auto strides = [&]() CUTLASS_LAMBDA_FUNC_INLINE { + if constexpr (majorSFA == UMMA::Major::MN) { + return make_stride(make_stride(_0{}, _1{}), make_stride(_0{}, Int(CtaShape_MNK{}), SFVecSizeN)>{})); + } + else { + return make_stride(make_stride(_0{}, Int(CtaShape_MNK{}), SFVecSizeK)>{}), make_stride(_0{}, _1{})); + } + }(); + + auto [M, N, K] = cta_shape_mnk; + return make_layout( + make_shape(make_shape(Int{}, Int(CtaShape_MNK{}), SFVecSizeN)>{}), + make_shape(Int{}, Int(CtaShape_MNK{}), SFVecSizeK)>{})), + strides + ); + } + + // The following function is provided for user fill dynamic problem size to the layout_SFA. 
+ template + CUTE_HOST_DEVICE + static constexpr auto + tile_atom_to_shape_SFA(ProblemShape problem_shape) { + auto problem_shape_MNKL = append<4>(problem_shape, 1); + + auto strides = [&]() CUTLASS_LAMBDA_FUNC_INLINE { + auto [M, N, K, L] = problem_shape_MNKL; + if constexpr (majorSFA == UMMA::Major::MN) { + return make_stride(make_stride(_0{}, _1{}), make_stride(_0{}, cute::ceil_div(M, SFVecSizeM))); + } + else { + return make_stride(make_stride(_0{}, cute::ceil_div(K, SFVecSizeK)), make_stride(_0{}, _1{})); + } + }(); + + auto [M, N, K, L] = problem_shape_MNKL; + auto mk_layout = make_layout( + make_shape(make_shape(Int{}, cute::ceil_div(M, SFVecSizeM)), + make_shape(Int{}, cute::ceil_div(K, SFVecSizeK))), + strides + ); + + return make_layout(append(shape(mk_layout), L), append(stride(mk_layout), size(filter_zeros(mk_layout)))); + } + + // The following function is provided for user fill dynamic problem size to the layout_SFB. + template + CUTE_HOST_DEVICE + static constexpr auto + tile_atom_to_shape_SFB(ProblemShape problem_shape) { + auto problem_shape_MNKL = append<4>(problem_shape, 1); + + auto strides = [&]() CUTLASS_LAMBDA_FUNC_INLINE { + auto [M, N, K, L] = problem_shape_MNKL; + + if constexpr (majorSFB == UMMA::Major::MN) { + return make_stride(make_stride(_0{}, _1{}), make_stride(_0{}, cute::ceil_div(N, SFVecSizeN))); + } + else { + return make_stride(make_stride(_0{}, cute::ceil_div(K, SFVecSizeK)), make_stride(_0{}, _1{})); + } + }(); + + auto [M, N, K, L] = problem_shape_MNKL; + auto nk_layout = make_layout( + make_shape(make_shape(Int{}, cute::ceil_div(N, SFVecSizeN)), + make_shape(Int{}, cute::ceil_div(K, SFVecSizeK))), + strides + ); + + return make_layout(append(shape(nk_layout), L), append(stride(nk_layout), size(filter_zeros(nk_layout)))); + } + +}; + +template +struct RuntimeBlockwiseScaleConfig { + + using ShapeSFA = Shape, Shape, int32_t>; + using ShapeSFB = Shape, Shape, int32_t>; + + using StrideSFA = conditional_t,Stride<_0,int32_t>, 
int32_t>, + Stride,Stride<_0,_1>, int32_t>>; + + using StrideSFB = conditional_t,Stride<_0,int32_t>, int32_t>, + Stride,Stride<_0,_1>, int32_t>>; + + using LayoutSFA = Layout; + using LayoutSFB = Layout; + + CUTE_HOST_DEVICE + static constexpr auto + deduce_layoutSFA() { + return LayoutSFA{}; + } + + CUTE_HOST_DEVICE + static constexpr auto + deduce_layoutSFB() { + return LayoutSFB{}; + } + + // The following function is provided for user fill dynamic problem size to the layout_SFA. + template + CUTE_HOST_DEVICE + static constexpr auto + tile_atom_to_shape_SFA(ProblemShape problem_shape, SFVecShape sf_vec_shape) { + auto problem_shape_MNKL = append<4>(problem_shape, 1); + + auto strides = [&]() CUTLASS_LAMBDA_FUNC_INLINE { + auto [M, N, K, L] = problem_shape_MNKL; + auto [sfm, sfn, sfk] = sf_vec_shape; + if constexpr (majorSFA == UMMA::Major::MN) { + return make_stride(make_stride(_0{}, _1{}), make_stride(_0{}, cute::ceil_div(M, sfm))); + } + else { + return make_stride(make_stride(_0{}, cute::ceil_div(K, sfk)), make_stride(_0{}, _1{})); + } + }(); + + auto [M, N, K, L] = problem_shape_MNKL; + auto [sfm, sfn, sfk] = sf_vec_shape; + auto mk_layout = make_layout( + make_shape(make_shape(sfm, cute::ceil_div(M, sfm)), + make_shape(sfk, cute::ceil_div(K, sfk))), + strides + ); + + return make_layout(append(shape(mk_layout), L), append(stride(mk_layout), size(filter_zeros(mk_layout)))); + } + + // The following function is provided for user fill dynamic problem size to the layout_SFB. 
+ template + CUTE_HOST_DEVICE + static constexpr auto + tile_atom_to_shape_SFB(ProblemShape problem_shape, SFVecShape sf_vec_shape) { + auto problem_shape_MNKL = append<4>(problem_shape, 1); + + auto strides = [&]() CUTLASS_LAMBDA_FUNC_INLINE { + auto [M, N, K, L] = problem_shape_MNKL; + auto [sfm, sfn, sfk] = sf_vec_shape; + + if constexpr (majorSFB == UMMA::Major::MN) { + return make_stride(make_stride(_0{}, _1{}), make_stride(_0{}, cute::ceil_div(N, sfn))); + } + else { + return make_stride(make_stride(_0{}, cute::ceil_div(K, sfk)), make_stride(_0{}, _1{})); + } + }(); + + auto [M, N, K, L] = problem_shape_MNKL; + auto [sfm, sfn, sfk] = sf_vec_shape; + auto nk_layout = make_layout( + make_shape(make_shape(sfn, cute::ceil_div(N, sfn)), + make_shape(sfk, cute::ceil_div(K, sfk))), + strides + ); + + return make_layout(append(shape(nk_layout), L), append(stride(nk_layout), size(filter_zeros(nk_layout)))); + } + +}; + +// Sm90 only supports MN major for SFA and SFB for now +template +using Sm90BlockwiseScaleConfig = Sm1xxBlockwiseScaleConfig< + SFVecSizeM, + SFVecSizeN, + SFVecSizeK, + majorSFA == cute::GMMA::Major::MN ? UMMA::Major::MN : UMMA::Major::K, + majorSFB == cute::GMMA::Major::MN ? 
UMMA::Major::MN : UMMA::Major::K>; + +template +using Sm100BlockwiseScaleConfig = Sm1xxBlockwiseScaleConfig; + +template +using Sm120BlockwiseScaleConfig = Sm1xxBlockwiseScaleConfig; + +template +constexpr auto sm90_trivial_blockwise_scale_config(MmaTileShape_MNK) { + return Sm90BlockwiseScaleConfig(MmaTileShape_MNK{}), size<1>(MmaTileShape_MNK{}), size<2>(MmaTileShape_MNK{})>{}; +} + +template +constexpr auto sm100_trivial_blockwise_scale_config(MmaTileShape_MNK) { + return Sm100BlockwiseScaleConfig(MmaTileShape_MNK{}), size<1>(MmaTileShape_MNK{}), size<2>(MmaTileShape_MNK{})>{}; +} + +template +constexpr auto sm120_trivial_blockwise_scale_config(MmaTileShape_MNK) { + return Sm120BlockwiseScaleConfig(MmaTileShape_MNK{}), size<1>(MmaTileShape_MNK{}), size<2>(MmaTileShape_MNK{})>{}; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::detail diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/cluster.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/cluster.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d35765adebaa35bfcd767ff245ec72d453c28563 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/cluster.hpp @@ -0,0 +1,99 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + + + +#include "cute/container/tuple.hpp" +#include "cute/arch/cluster_sm90.hpp" +#include "cutlass/trace.h" +#include "cute/layout.hpp" // cute::make_shape +#include "cutlass/trace.h" // CUTLASS_TRACE_HOST + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::detail { + +// Returns either ClusterShape, if it is static, or a Shape> populated with the +// x and y dimensions of `dynamic_cluster_shape`. 
+template +CUTLASS_HOST_DEVICE +static auto +select_cluster_shape(ClusterShape cluster_shape, dim3 dynamic_cluster_shape) { + return cute::conditional_return>( + make_shape(static_cast(dynamic_cluster_shape.x), static_cast(dynamic_cluster_shape.y), cute::Int<1>{}), + cluster_shape); +} + +template +CUTLASS_DEVICE +static auto +select_cluster_shape(ClusterShape cluster_shape) { + if constexpr (cute::is_static_v) { + return cluster_shape; + } + else { + dim3 dynamic_cluster_shape = cute::cluster_shape(); + return make_shape(static_cast(dynamic_cluster_shape.x), static_cast(dynamic_cluster_shape.y), cute::Int<1>{}); + } +} + +// Dynamic cluster shape can_implement rule +template +CUTLASS_HOST_DEVICE +bool +preferred_cluster_can_implement(dim3 cluster_shape, dim3 cluster_shape_fallback) { + bool implementable{true}; + + // Runtime cluster shape should satisfy MMA requirements + auto AtomThrShapeM = cute::size<0>(AtomThrShapeMNK{}); + implementable &= (cluster_shape.x > 0 && cluster_shape.y > 0 && cluster_shape.z > 0); + implementable &= (cluster_shape.x % AtomThrShapeM == 0); + + implementable &= (cluster_shape_fallback.x > 0 && cluster_shape_fallback.y > 0 && cluster_shape_fallback.z > 0); + implementable &= (cluster_shape_fallback.x % AtomThrShapeM == 0); + + // Only support pow2 runtime cluster shape for now + implementable &= ispow2(cluster_shape.x) && + ispow2(cluster_shape.y) && + ispow2(cluster_shape.z); + + implementable &= ispow2(cluster_shape_fallback.x) && + ispow2(cluster_shape_fallback.y) && + ispow2(cluster_shape_fallback.z); + + return implementable; +} + +} // namespace cutlass::detail + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/collective.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/collective.hpp new file mode 100644 index 
0000000000000000000000000000000000000000..01085c54159fc1cd5d6b7e2ee1d40a46cccd4f67 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/collective.hpp @@ -0,0 +1,191 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include "cute/container/tuple.hpp" +#include "cute/layout.hpp" // cute::size(shape) +#include "cute/arch/mma_sm100_desc.hpp" // cute::UMMA::MXF4Format, cute::UMMA::MXF8F6F4Format +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template +struct deduce_mixed_width_dtype { +static_assert(I >= 0u && I <= 2u, "Valid indices are 0, 1, and 2, which represent Operand, Scale, and Bias, respectively."); + +private: + using underlying_tuple = cute::conditional_t::value, Tuple, cute::tuple>; + static constexpr size_t valid_index = cute::min(I, cute::tuple_size_v - 1); + +public: + using type = cute::conditional_t<(I < cute::tuple_size_v), + cute::tuple_element_t, + void>; +}; + +template +using deduce_mixed_width_dtype_t = typename deduce_mixed_width_dtype::type; + + + +template +CUTLASS_HOST_DEVICE +static constexpr bool +is_sm10x_runtime_f8f6f4() { + return (cute::is_same_v || + cute::is_same_v || + cute::is_same_v); +} + +template +CUTLASS_HOST_DEVICE +static constexpr bool +is_sm10x_f8f6f4_inputs() { + return ( + + cute::is_same_v || + cute::is_same_v || + cute::is_same_v || + + cute::is_same_v || + cute::is_same_v + || cute::is_same_v || + cute::is_same_v || + cute::is_same_v + + ) && + ( + + cute::is_same_v || + cute::is_same_v || + cute::is_same_v || + + cute::is_same_v || + cute::is_same_v + || cute::is_same_v || + cute::is_same_v || + cute::is_same_v + + ); +} + +template +CUTLASS_HOST_DEVICE +static constexpr bool +is_sm100_mma_f8f6f4() { + return (cute::size<2>(typename TiledMma::Shape_MNK{}) == 32) && is_sm10x_f8f6f4_inputs(); +} + +template +CUTLASS_HOST_DEVICE +static constexpr bool +is_sm10x_f8f6f4_element() { + return 
(cute::is_same_v + || cute::is_same_v + || cute::is_same_v + || cute::is_same_v + || cute::is_same_v + + ); +} + + +template +CUTLASS_HOST_DEVICE +static constexpr bool +is_sm10x_f4_element() { + return (cute::is_same_v + ); +} + +template +CUTLASS_HOST_DEVICE +static constexpr bool +is_sm10x_mxf8f6f4_input() { + // ElementType must be F8, F6, or F4 + return ( cute::is_same_v || + cute::is_same_v || + cute::is_same_v || + cute::is_same_v || + cute::is_same_v || + cute::is_same_v || + cute::is_same_v || + cute::is_same_v); +} + +template +CUTLASS_HOST_DEVICE +static constexpr bool +is_sm10x_mxf4nvf4_input() { + // ElementType must be F4 + return ( cute::is_same_v || + cute::is_same_v + ); +} + +template +struct sm10x_block_scale_runtime_input_t { + static constexpr bool IsF8F6F4MmaInput = is_sm10x_mxf8f6f4_input(); + static constexpr bool IsF4MmaInput = is_sm10x_mxf4nvf4_input(); + + using Type = cute::conditional_t + >; +}; + + +template +CUTLASS_HOST_DEVICE +static constexpr bool +is_sm120_f8f6f4() { + return (cute::size<2>(typename TiledMma::Shape_MNK{}) == 32) && is_sm10x_f8f6f4_inputs(); +} + +template +CUTLASS_HOST_DEVICE +static constexpr bool +is_sm100_sparse_f8f6f4() { + return (cute::size<2>(typename TiledMma::Shape_MNK{}) == 64) && is_sm10x_f8f6f4_inputs(); +} + +} // namespace detail + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::collective diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/collective/mixed_input_utils.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/collective/mixed_input_utils.hpp new file mode 100644 index 0000000000000000000000000000000000000000..89d250001eaa00990319cc2a1da35ec0dccb8703 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/collective/mixed_input_utils.hpp @@ -0,0 +1,1249 @@ 
+/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_conversion.h" + +#include "cute/util/type_traits.hpp" +#include "cute/arch/copy_sm90.hpp" +#include "cute/numeric/arithmetic_tuple.hpp" + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +// The universal converter +template < + class SrcType, + class DstType, + class LayoutIn, + class LayoutOut +> +struct LayoutAwareConvertImpl { + template + CUTLASS_DEVICE + static void convert( + cute::Tensor const& src, + cute::Tensor & dst) { + + static_assert(cute::is_same_v && + cute::is_same_v); + static_assert(cute::cosize_v == cute::cosize_v); + constexpr int N = decltype(cute::max_common_vector(LayoutIn{}, LayoutOut{})){}; + using SrcArray = cutlass::Array; + using DstArray = cutlass::Array; + using Converter = cutlass::NumericArrayConverter; + auto&& src_vm = cute::recast(src); + auto&& dst_vm = cute::recast(dst); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < src_vm.size(); ++i) { + dst_vm(i) = Converter::convert(src_vm(i)); + } + } +}; + +// Specialization for INT4 -> BF16 with [02461357] value order +template <> +struct LayoutAwareConvertImpl< + cutlass::int4b_t, + cutlass::bfloat16_t, + cute::Layout, cute::Stride<_4,_1>>, + cute::Layout<_8> +> { + template + CUTLASS_DEVICE + static void convert( + cute::Tensor, cute::Stride<_4,_1>> + > const& src, + cute::Tensor + >& dst) { + + static_assert(cute::is_same_v && + cute::is_same_v); + using SrcArray = cutlass::Array; + using DstArray = cutlass::Array; + using RegArray = cutlass::AlignedArray; + + auto&& src_reg = cute::recast(src)(0); + auto&& r = cute::recast(dst)(0); + CUTLASS_PRAGMA_UNROLL + for (size_t ii = 0; ii < RegArray::kElements; ++ii) { + r[ii] = src_reg >> (4 * (ii)); + static constexpr uint32_t xor_mask = 0x43084308; + static constexpr uint32_t lo_mask = 
0x000F000F; + static constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa; + asm volatile( + "{\n" + " lop3.b32 %0, %0, %1, %2, %3;\n" + "}\n" + : "+r"(r[ii]) + : "n"(lo_mask), "n"(xor_mask), "n"(immLut)); + static constexpr uint32_t lo_bias = xor_mask; // 0x43084308, {136, 136} + { + __nv_bfloat162& bf16x2_val = reinterpret_cast<__nv_bfloat162&>(r[ii]); + bf16x2_val = __hsub2(bf16x2_val, + reinterpret_cast(lo_bias)); + } + } + } +}; + +// Specialization for UINT4 -> BF16 with [02461357] value order +template <> +struct LayoutAwareConvertImpl< + cutlass::uint4b_t, + cutlass::bfloat16_t, + cute::Layout, cute::Stride<_4,_1>>, + cute::Layout<_8> +> { + template + CUTLASS_DEVICE + static void convert( + cute::Tensor, cute::Stride<_4,_1>> + > const& src, + cute::Tensor + >& dst) { + + static_assert(cute::is_same_v && + cute::is_same_v); + using SrcArray = cutlass::Array; + using DstArray = cutlass::Array; + using RegArray = cutlass::AlignedArray; + + auto&& src_reg = cute::recast(src)(0); + auto&& r = cute::recast(dst)(0); + CUTLASS_PRAGMA_UNROLL + for (size_t ii = 0; ii < RegArray::kElements; ++ii) { + r[ii] = src_reg >> (4 * (ii)); + static constexpr uint32_t or_mask = 0x43004300; + static constexpr uint32_t lo_mask = 0x000F000F; + static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; + asm volatile( + "{\n" + " lop3.b32 %0, %0, %1, %2, %3;\n" + "}\n" + : "+r"(r[ii]) + : "n"(lo_mask), "n"(or_mask), "n"(immLut)); + static constexpr uint32_t lo_bias = or_mask; // 0x43004300, {128, 128} + { + __nv_bfloat162& bf16x2_val = reinterpret_cast<__nv_bfloat162&>(r[ii]); + bf16x2_val = __hsub2(bf16x2_val, + reinterpret_cast(lo_bias)); + } + } + } +}; + +// Specialization for INT4 -> FP16 with [02461357] value order +template <> +struct LayoutAwareConvertImpl< + cutlass::int4b_t, + cutlass::half_t, + cute::Layout, cute::Stride<_4,_1>>, + cute::Layout<_8> +> { + template + CUTLASS_DEVICE + static void convert( + cute::Tensor, cute::Stride<_4,_1>> + > const& src, + cute::Tensor + >& 
dst) { + + static_assert(cute::is_same_v && + cute::is_same_v); + using SrcArray = cutlass::Array; + using DstArray = cutlass::Array; + using RegArray = cutlass::AlignedArray; + + auto&& src_reg = cute::recast(src)(0); + auto&& r = cute::recast(dst)(0); + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < RegArray::kElements; ii += 2) { + auto src_ = src_reg >> (4 * (ii)); + r[ii + 0] = src_; + r[ii + 1] = src_; + static constexpr uint32_t lo_xor_mask = 0x64086408; + static constexpr uint32_t hi_xor_mask = 0x64806480; + static constexpr uint32_t lo_mask = 0x000F000F; + static constexpr uint32_t hi_mask = 0x00F000F0; + static constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa; + asm volatile( + "{\n" + " lop3.b32 %0, %0, %1, %2, %3;\n" + "}\n" + : "+r"(r[ii + 0]) + : "n"(lo_mask), "n"(lo_xor_mask), "n"(immLut)); + asm volatile( + "{\n" + " lop3.b32 %0, %0, %1, %2, %3;\n" + "}\n" + : "+r"(r[ii + 1]) + : "n"(hi_mask), "n"(hi_xor_mask), "n"(immLut)); + static constexpr uint32_t lo_bias = 0x64086408; // {1032, 1032} + static constexpr uint32_t hi_bias = 0xD480D480; // {-72, -72} + static constexpr uint32_t hi_scale = 0x2C002C00; // {1/16, 1/16} + { + half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii + 0]); + fp16x2_val = __hsub2(fp16x2_val, + reinterpret_cast(lo_bias)); + } + { + half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii + 1]); + fp16x2_val = __hfma2(fp16x2_val, + reinterpret_cast(hi_scale), + reinterpret_cast(hi_bias)); + } + } + } +}; + +// Specialization for UINT4 -> FP16 with [02461357] value order +template <> +struct LayoutAwareConvertImpl< + cutlass::uint4b_t, + cutlass::half_t, + cute::Layout, cute::Stride<_4,_1>>, + cute::Layout<_8> +> { + template + CUTLASS_DEVICE + static void convert( + cute::Tensor, cute::Stride<_4,_1>> + > const& src, + cute::Tensor + >& dst) { + + static_assert(cute::is_same_v && + cute::is_same_v); + using SrcArray = cutlass::Array; + using DstArray = cutlass::Array; + using RegArray = cutlass::AlignedArray; + + auto&& src_reg = 
cute::recast(src)(0); + auto&& r = cute::recast(dst)(0); + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < RegArray::kElements; ii += 2) { + auto src_ = src_reg >> (4 * (ii)); + r[ii + 0] = src_; + r[ii + 1] = src_; + static constexpr uint32_t or_mask = 0x64006400; + static constexpr uint32_t lo_mask = 0x000F000F; + static constexpr uint32_t hi_mask = 0x00F000F0; + static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; + asm volatile( + "{\n" + " lop3.b32 %0, %0, %1, %2, %3;\n" + "}\n" + : "+r"(r[ii]) + : "n"(lo_mask), "n"(or_mask), "n"(immLut)); + asm volatile( + "{\n" + " lop3.b32 %0, %0, %1, %2, %3;\n" + "}\n" + : "+r"(r[ii + 1]) + : "n"(hi_mask), "n"(or_mask), "n"(immLut)); + static constexpr uint32_t lo_bias = or_mask; // 0x64006400, {1024, 1024} + static constexpr uint32_t hi_bias = 0xD400D400; // {-64, -64} + static constexpr uint32_t hi_scale = 0x2C002C00; // {1/16, 1/16} + { + half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii + 0]); + fp16x2_val = __hsub2(fp16x2_val, + reinterpret_cast(lo_bias)); + } + { + half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii + 1]); + fp16x2_val = __hfma2(fp16x2_val, + reinterpret_cast(hi_scale), + reinterpret_cast(hi_bias)); + } + } + } +}; +/* +// Specialization for E5M2 -> FP16 with [3120] value order +template <> +struct LayoutAwareConvertImpl< + cutlass::float_e5m2_t, + cutlass::half_t, + cute::Layout, cute::Stride<_2,_1>>, + cute::Layout<_4> +> { + template + CUTLASS_DEVICE + static void convert( + cute::Tensor, cute::Stride<_2,_1>> + > const& src, + cute::Tensor + >& dst) { + + static_assert(cute::is_same_v && + cute::is_same_v); + using SrcArray = cutlass::Array; + using DstArray = cutlass::Array; + using RegArray = cutlass::AlignedArray; + + auto&& src_reg = cute::recast(src)(0); + auto&& r = cute::recast(dst)(0); + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < RegArray::kElements; ++ii) { + // in registers: a3, a1, a2, a0 + r[RegArray::kElements - ii - 1] = src_reg << (8 * (ii)); + + static constexpr uint32_t 
and_mask = 0xFF00FF00; + asm volatile( + "{\n" + " and.b32 %0, %0, %1;\n" + "}\n" + : "+r"(r[ii]) + : "n"(and_mask)); + } + } +}; +*/ +// Specialization for INT8 -> BF16 with [3120] value order +template <> +struct LayoutAwareConvertImpl< + cutlass::int8_t, + cutlass::bfloat16_t, + cute::Layout, cute::Stride<_2,_1>>, + cute::Layout<_4> +> { + template + CUTLASS_DEVICE + static void convert( + cute::Tensor, cute::Stride<_2,_1>> + > const& src, + cute::Tensor + >& dst) { + + static_assert(cute::is_same_v && + cute::is_same_v); + using SrcArray = cutlass::Array; + using DstArray = cutlass::Array; + using RegArray = cutlass::AlignedArray; + + auto&& src_reg = cute::recast(src)(0); + auto&& r = cute::recast(dst)(0); + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < RegArray::kElements; ++ii) { + uint32_t tmp0, tmp1; + r[ii] = src_reg >> (8 * (ii)); + static constexpr uint32_t or_mask = 0x43004300; + static constexpr uint32_t and_mask_0 = 0x007F007F; + static constexpr uint32_t and_mask_1 = 0x00800080; + static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; + asm volatile( + "{\n" + " lop3.b32 %0, %1, %2, %3, %4;\n" + "}\n" + : "=r"(tmp0) + : "r"(r[ii]), "n"(and_mask_0), "n"(or_mask), "n"(immLut)); + asm volatile( + "{\n" + " lop3.b32 %0, %1, %2, %3, %4;\n" + "}\n" + : "=r"(tmp1) + : "r"(r[ii]), "n"(and_mask_1), "n"(or_mask), "n"(immLut)); + { + __nv_bfloat162& bf16x2_val = reinterpret_cast<__nv_bfloat162&>(r[ii]); + bf16x2_val = __hsub2(reinterpret_cast<__nv_bfloat162 const&>(tmp0), + reinterpret_cast<__nv_bfloat162 const&>(tmp1)); + } + } + } +}; + +// Specialization for INT8 -> FP16 with [3120] value order +template <> +struct LayoutAwareConvertImpl< + cutlass::int8_t, + cutlass::half_t, + cute::Layout, cute::Stride<_2,_1>>, + cute::Layout<_4> +> { + template + CUTLASS_DEVICE + static void convert( + cute::Tensor, cute::Stride<_2,_1>> + > const& src, + cute::Tensor + >& dst) { + + static_assert(cute::is_same_v && + cute::is_same_v); + using SrcArray = cutlass::Array; 
+ using DstArray = cutlass::Array; + using RegArray = cutlass::AlignedArray; + + auto&& src_reg = cute::recast(src)(0); + auto&& r = cute::recast(dst)(0); + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < RegArray::kElements; ++ii) { + r[ii] = src_reg >> (8 * (ii)); + static constexpr uint32_t xor_mask = 0x64806480; + static constexpr uint32_t and_mask = 0x00FF00FF; + static constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa; + asm volatile( + "{\n" + " lop3.b32 %0, %0, %1, %2, %3;\n" + "}\n" + : "+r"(r[ii]) + : "n"(and_mask), "n"(xor_mask), "n"(immLut)); + { + static constexpr uint32_t bias = 0x64806480; + __half2& fp16x2_val = reinterpret_cast<__half2&>(r[ii]); + fp16x2_val = __hsub2(fp16x2_val, + reinterpret_cast<__half2 const&>(bias)); + } + } + } +}; + +template < + class EngineIn, + class EngineOut, + class LayoutIn, + class LayoutOut +> +CUTLASS_DEVICE +void LayoutAwareConvert( // Accept mutable temporaries + cute::Tensor const& src, + cute::Tensor && dst) { + + LayoutAwareConvert(src, dst); +} +template < + class EngineIn, + class EngineOut, + class LayoutIn, + class LayoutOut +> +CUTLASS_DEVICE +void LayoutAwareConvert( + cute::Tensor const& src, + cute::Tensor & dst) { + + using SrcType = typename EngineIn::value_type; + using DstType = typename EngineOut::value_type; + Tensor src_vm = coalesce(src); + Tensor dst_vm = coalesce(dst); + Layout src_layout = src_vm.layout(); + Layout dst_layout = dst_vm.layout(); + LayoutAwareConvertImpl::convert(src_vm, dst_vm); +} + + +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + namespace detail { + enum class ConversionMode { + DirectConvert, // A * B + ConvertAndScale, // (scale * A) * B + ConvertAndScaleWithZero // (scale * A + zeros) * B + }; + } // namespace detail +} //namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace 
cutlass::gemm::collective::detail { + +template +static constexpr +CUTLASS_HOST_DEVICE +auto get_logical_ptr(PointerType const* ptr) { + return cute::recast_ptr(ptr); +} +template +static constexpr +CUTLASS_HOST_DEVICE +auto get_smem_layout(LayoutAtom layout_atom, TileShape const& tile_shape, Stride const& stride) { + if constexpr (not cute::is_layout::value) { + return tile_to_shape( + layout_atom, + append(tile_shape, Int{}), + cute::conditional_t< ::cutlass::gemm::detail::is_major<0,Stride>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}); + } + else { + auto gmem_tile = composition(stride, tile_shape); + return make_layout_like(append(gmem_tile, make_layout(Int{}, 0))); + } +} +template +static constexpr +CUTLASS_HOST_DEVICE +auto get_gmem_layout(Shape const& shape, Stride const& stride) { + if constexpr (not cute::is_layout::value) { + return make_layout(shape, stride); + } + else { + return stride; + } +} + +template +struct MixedInputUtils { +private: + using ConversionMode = cutlass::detail::ConversionMode; + using KernelSchedule = typename Collective::KernelSchedule; + using SmemLayoutA = typename Collective::SmemLayoutA; + using SmemLayoutB = typename Collective::SmemLayoutB; + using SmemLayoutScale = typename Collective::SmemLayoutScale; + using SwappedElementA = typename Collective::SwappedElementA; + using SwappedElementB = typename Collective::SwappedElementB; + using RealSwappedElementA = typename Collective::RealSwappedElementA; + using RealSwappedElementB = typename Collective::RealSwappedElementB; + using ElementScale = typename Collective::ElementScale; + using ElementZero = typename Collective::ElementZero; + using SmemCopyAtomScale = typename Collective::SmemCopyAtomScale; + static constexpr auto KernelConversionMode = Collective::KernelConversionMode; + static constexpr auto ModeHasScales = Collective::ModeHasScales; + static constexpr auto UseScaleLookupTable = Collective::UseScaleLookupTable; + +public: + static constexpr auto + 
elements_per_smem_scale() { + if constexpr (KernelConversionMode == ConversionMode::DirectConvert) { + return 0; + } + else if constexpr (ModeHasScales) { + return cute::cosize_v; + } + else { + static_assert(cutlass::detail::dependent_false, "Type not handled in scale smem allocation."); + } + } + + static constexpr auto + elements_per_smem_zero() { + if constexpr (KernelConversionMode == ConversionMode::DirectConvert || + KernelConversionMode == ConversionMode::ConvertAndScale ) { + return 0; + } + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) { + return cute::cosize_v; + } + else { + static_assert(cutlass::detail::dependent_false, "Type not handled in scale smem allocation."); + } + } + + // These methods use some the public members of the class. For that reason, we define them after the public section. + static constexpr uint32_t + compute_tma_transaction_bytes_mk() { + return cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast(cute::sizeof_bits_v)); + } + + static constexpr uint32_t + compute_tma_transaction_bytes_nk() { + return cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast(cute::sizeof_bits_v)); + } + + static constexpr uint32_t + compute_tma_transaction_bytes_extra() { + if constexpr (KernelConversionMode == ConversionMode::DirectConvert) { + return 0; + } + else if constexpr (ModeHasScales) { + constexpr uint32_t scale_tx_bytes = cutlass::bits_to_bytes(size<0>(SmemLayoutScale{}) * size<1>(SmemLayoutScale{}) * static_cast(cute::sizeof_bits_v)); + static_assert(scale_tx_bytes % 128 == 0, "Each scale stage must be 128B aligned."); // required by TMA + if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) { + return scale_tx_bytes; + } + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) { + // Scale and zero share smem layout + constexpr uint32_t zero_tx_bytes = 
cutlass::bits_to_bytes(size<0>(SmemLayoutScale{}) * size<1>(SmemLayoutScale{}) * static_cast(cute::sizeof_bits_v)); + static_assert(zero_tx_bytes % 128 == 0, "Each zero stage must be 128B aligned."); // required by TMA + return scale_tx_bytes + zero_tx_bytes; + } + else { + static_assert(cutlass::detail::dependent_false, "Type not handled in tma transaction bytes computation."); + } + } + else { + static_assert(cutlass::detail::dependent_false, "Type not handled in tma transaction bytes computation."); + } + } + + static constexpr uint32_t + compute_tma_transaction_bytes_extra_transform() { + if constexpr (KernelConversionMode == ConversionMode::DirectConvert) { + return 0; + } + else if constexpr (ModeHasScales) { + constexpr uint32_t scale_tx_bytes = cutlass::bits_to_bytes(size<0>(filter_zeros(SmemLayoutScale{})) * size<1>(filter_zeros(SmemLayoutScale{})) * static_cast(cute::sizeof_bits_v)); + static_assert(scale_tx_bytes % 128 == 0, "Each scale stage must be 128B aligned."); // required by TMA + if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) { + return scale_tx_bytes; + } + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) { + // Scale and zero share smem layout + constexpr uint32_t zero_tx_bytes = cutlass::bits_to_bytes(size<0>(filter_zeros(SmemLayoutScale{})) * size<1>(filter_zeros(SmemLayoutScale{})) * static_cast(cute::sizeof_bits_v)); + static_assert(zero_tx_bytes % 128 == 0, "Each zero stage must be 128B aligned."); // required by TMA + return scale_tx_bytes + zero_tx_bytes; + } + else { + static_assert(cutlass::detail::dependent_false, "Type not handled in tma transaction bytes computation."); + } + } + else { + static_assert(cutlass::detail::dependent_false, "Type not handled in tma transaction bytes computation."); + } + } + + /// Utilities to copy A and extra inputs from smem to RF + template + CUTLASS_DEVICE + static void copy_tensors_MK( + SmemTiledCopyA const& smem_tiled_copy_A, + 
TensorASmemView const& tCsA, + TensorACopyView& tCrA_copy_view, + cute::tuple const& partitioned_mma_extra_info, + cute::tuple const& tiled_copy_and_views, + int k_block, + int read_stage) { + + copy(smem_tiled_copy_A, tCsA(_,_,k_block,read_stage), tCrA_copy_view(_,_,k_block)); + + if (k_block == 0) { + // We are starting a new k-tile so copy the scale + if constexpr (KernelConversionMode == ConversionMode::DirectConvert) { + // nothing to do + } + else if constexpr (ModeHasScales) { + auto smem_tiled_copy_S = cute::get<0>(tiled_copy_and_views); + auto tCrS_copy_view = cute::get<1>(tiled_copy_and_views); + auto tCsS = cute::get<0>(partitioned_mma_extra_info); + copy(smem_tiled_copy_S, tCsS(_,_,k_block,read_stage), tCrS_copy_view(_,_,k_block)); + if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) { + // Nothing extra to do + } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) { + auto tCsZ = cute::get<2>(partitioned_mma_extra_info); + auto tCrZ_copy_view = cute::get<2>(tiled_copy_and_views); + copy(smem_tiled_copy_S, tCsZ(_,_,k_block,read_stage), tCrZ_copy_view(_,_,k_block)); + } else { + static_assert(cutlass::detail::dependent_false, "Conversion mode not handled in A -> RF path."); + } + } + else { + static_assert(cutlass::detail::dependent_false, "Conversion mode not handled in A -> RF path."); + } + } + } + + /// (Designed for separate transform pipeline in Blackwell) + /// Utilities to copy extra inputs from smem to RF + template + CUTLASS_DEVICE + static void copy_scale_zeros_for_transform( + cute::tuple & partitioned_transform_extra_info, + int load2transform_consumer_index) { + + if constexpr (KernelConversionMode == ConversionMode::DirectConvert) { + // nothing to do + } + else if constexpr (ModeHasScales) { + auto smem_tiled_copy_S = cute::get<0>(partitioned_transform_extra_info); + auto&& scales = cute::get<1>(partitioned_transform_extra_info); + using ScaleType = decltype(scales); + auto tSrS = 
make_tensor(scales.data(), scales.layout()); + auto tSsS = cute::get<2>(partitioned_transform_extra_info); + copy(smem_tiled_copy_S, tSsS(_,_,_,_,load2transform_consumer_index), tSrS); + + if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) { + // Nothing extra to do + } else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) { + auto&& zeros = cute::get<3>(partitioned_transform_extra_info); + using ZeroType = decltype(zeros); + auto tZrZ = make_tensor(zeros.data(), zeros.layout()); + auto tZsZ = cute::get<4>(partitioned_transform_extra_info); + copy(smem_tiled_copy_S, tZsZ(_,_,_,_,load2transform_consumer_index), tZrZ); + + } else { + static_assert(cutlass::detail::dependent_false, "Conversion mode not handled in A -> RF path."); + } + } + else { + static_assert(cutlass::detail::dependent_false, "Conversion mode not handled in A -> RF path."); + } + } + + // Helper functions to select packing for conversion + template + struct select_packing { // Naive packing policy + static constexpr auto value() { + return Int, sizeof_bits_v))>{}; + } + }; + + // The core converter uses a lookup table to converts i4 -> 8 bit value. 
+ template + CUTLASS_DEVICE + static void lookup_table_convert( // Accept mutable temporaries + Tensor const& src, + Tensor && dst, + Tensor const& scales_neg, + Tensor const& scales_pos) { + + lookup_table_convert(src, dst, scales_neg, scales_pos); + } + template + CUTLASS_DEVICE + static void lookup_table_convert( + Tensor const& src, + Tensor & dst, + Tensor const& scales_neg, + Tensor const& scales_pos) { + + constexpr int N = cute::cosize(LayoutIn{}); + static_assert(N == 4 || N == 8); + static_assert(cosize(LayoutScale{}) <= N / 4, + "at least 4 consecutive weights must share the same scale."); + using SrcArray = cutlass::Array; + using DstArray = cutlass::Array; + using RegArray = cutlass::AlignedArray; + + // View the input as reg + auto&& src_reg = cute::recast(src)(0); + auto&& r = cute::recast(dst)(0); + + // Determines if to get from the signed or unsigned candidates + static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; + uint32_t sign; // ((reg & 0x88888888) | 0x64206420) >> 1 + asm volatile( + "{\n" + " lop3.b32 %0, %1, %2, %3, %4;\n" \ + "}\n" + : "=r"(sign) + : "r"(src_reg), "n"(0x88888888), "n"(0x64206420), "n"(immLut) + ); + sign = sign >> 1; + + // Ignore sign bit when indexing into LUT + uint32_t lut_idx = src_reg & 0x77777777; + Tensor scales_neg_ = cute::filter(scales_neg); + Tensor scales_pos_ = cute::filter(scales_pos); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 4; ++i, lut_idx >>=16, sign >>=16) { + auto&& scale_neg_ = reinterpret_cast const&>(scales_neg_(i)); + auto&& scale_pos_ = reinterpret_cast const&>(scales_pos_(i)); + asm volatile( + "{\n" + " .reg .b32 pos, neg ;\n" \ + " prmt .b32 neg, %3, %4, %1 ;\n" \ + " prmt .b32 pos, %5, %6, %1 ;\n" \ + " prmt .b32 %0, pos, neg, %2 ;\n" \ + "}\n" + : "=r"(r[i]) + : "r"(lut_idx), "r"(sign), "r"(scale_neg_[0]), "r"(scale_neg_[1]), "r"(scale_pos_[0]), "r"(scale_pos_[1]) + ); + } + } + + /// Utilities to dequantize A. 
+ template + CUTLASS_DEVICE + static void static_check_scale(Layout const& tensor) { + static_assert(shape<0>(Layout{}) >= 4 && stride<0>(Layout{}) == 0, "At least 4 adjacent weights in a thread must share the same scale."); + } + template + CUTLASS_DEVICE + static void static_check_scale(Tensor const& tensor) { + static_check_scale(flatten(Layout{})); + } + template + CUTLASS_DEVICE + static void dequantize_A_kblock( + Tensor const& tCrA_load, + Tensor& tCrA_mma, + cute::tuple& partitioned_extra_info, + int const k_block) { + + static_assert(is_rmem::value, "Input tensor for A conversion must come from registers"); + static_assert(is_rmem::value, "Output tensor for A conversion must come from registers"); + static_assert(cosize_v == cosize_v); + static_assert(size_v == cosize_v); + static_assert(size_v == cosize_v); + using SrcType = typename EngineIn::value_type; + using DstType = typename EngineOut::value_type; + + Tensor src = tCrA_load(_, _, k_block); + Tensor dst = tCrA_mma(_, _, k_block); + + CUTE_STATIC_ASSERT_V(size(src(_, 0)) == cosize(src(_, 0).layout()), + "The first mode of tensor src must be contiguous in memory"); + // try to make the size of the first mode equal to 32bit + int constexpr NumValPerSrcReg = cute::min(decltype(size(src(_, 0)))::value, + ceil_div(32, sizeof_bits_v)); + Tensor src_vm = cute::group_modes<1,-1>(cute::zipped_divide(src, Int{})); + Tensor dst_vm = cute::group_modes<1,-1>(cute::zipped_divide(dst, Int{})); + + if constexpr (KernelConversionMode == ConversionMode::DirectConvert) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<1>(dst_vm); ++i) { + LayoutAwareConvert(src_vm(_, i), dst_vm(_, i)); + } + } + else if constexpr (UseScaleLookupTable) { + constexpr int num_elements = decltype(size(src))::value; + static_assert(is_same_v, "Lookup table only supports int4 being the quant type now."); + static_assert(sizeof_bits_v == 64, "Lookup table only supports 8 8bit scale values now."); + static_assert(num_elements % 4 == 0 && 
num_elements >= 4, "Lookup table requires a vector size of 4x when converting."); + + Tensor tCrS_neg = cute::get<1>(partitioned_extra_info); + auto&& tCrS_pos = cute::get<2>(partitioned_extra_info); // modification to its value is needed + Tensor scales_neg = tCrS_neg(_, _, k_block); + Tensor scales_pos = tCrS_pos(_, _, k_block); + CUTE_STATIC_ASSERT_V(cute::size(src) == cute::size(scales_neg)); + + static_check_scale(scales_neg); + static_check_scale(scales_pos); + Tensor scales_neg_vm = cute::group_modes<1,-1>(cute::zipped_divide(scales_neg, Int{})); + Tensor scales_pos_vm = cute::group_modes<1,-1>(cute::zipped_divide(scales_pos, Int{})); + + if (k_block == 0) { + Tensor scales_neg_vm_ = filter(scales_neg_vm); + Tensor scales_pos_vm_ = filter(scales_pos_vm); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(scales_neg_vm_.layout()); ++i) + { + auto&& scale_neg_ = reinterpret_cast const&>(scales_neg_vm_(i)); + auto&& scale_pos_ = reinterpret_cast &>(scales_pos_vm_(i)); + constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa; + asm volatile( + "{\n" + " lop3 .b32 %0, %2, %4, %5, %6;\n" \ + " xor .b32 %1, %3, %5; \n" \ + "}\n" + : "=r"(scale_pos_[0]), "=r"(scale_pos_[1]) + : "r"(scale_neg_[0]), "r"(scale_neg_[1]), "n"(0xFFFFFF00), "n"(0x80808080), "n"(immLut) + ); + } + } + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<1>(dst_vm); ++i) { + lookup_table_convert(src_vm(_, i), dst_vm(_, i), scales_neg_vm(_, i), scales_pos_vm(_, i)); + } + } + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) { + Tensor scales = cute::get<1>(partitioned_extra_info)(_, _, k_block); + CUTE_STATIC_ASSERT_V(size(src) == size(scales)); + Tensor scales_vm = cute::group_modes<1,-1>(cute::zipped_divide(scales, Int{})); + + if constexpr (is_same_v) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<1>(dst_vm); ++i) { + LayoutAwareConvert(src_vm(_, i), dst_vm(_, i)); + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < size<0>(dst_vm); ++j) { + dst_vm(j, i) *= scales_vm(j, 
i); + } + } + } + else { + auto stage = make_tensor_like(src_vm(_, 0)); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<1>(dst_vm); ++i) { + LayoutAwareConvert(src_vm(_, i), stage); + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < size<0>(dst_vm); ++j) { + stage(j) *= scales_vm(j, i); + } + LayoutAwareConvert(stage, dst_vm(_, i)); + } + } + } + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) { + static_assert(is_same_v, "ElementScale and ElementZero must be the same."); + Tensor scales = cute::get<1>(partitioned_extra_info)(_, _, k_block); + Tensor zeros = cute::get<3>(partitioned_extra_info)(_, _, k_block); + CUTE_STATIC_ASSERT_V(size(src) == size(scales)); + CUTE_STATIC_ASSERT_V(size(src) == size(zeros)); + Tensor scales_vm = cute::group_modes<1,-1>(cute::zipped_divide(scales, Int{})); + Tensor zeros_vm = cute::group_modes<1,-1>(cute::zipped_divide(zeros, Int{})); + + if constexpr (is_same_v) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<1>(dst_vm); ++i) { + LayoutAwareConvert(src_vm(_, i), dst_vm(_, i)); + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < size<0>(dst_vm); ++j) { + dst_vm(j, i) = dst_vm(j, i) * scales_vm(j, i) + zeros_vm(j, i); + } + } + } + else { + auto stage = make_tensor_like(src_vm(_, 0)); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<1>(dst_vm); ++i) { + LayoutAwareConvert(src_vm(_, i), stage); + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < size<0>(dst_vm); ++j) { + stage(j) = stage(j) * scales_vm(j, i) + zeros_vm(j, i); + } + LayoutAwareConvert(stage, dst_vm(_, i)); + } + } + } + else { + static_assert(cutlass::detail::dependent_false, "No A data is loaded."); + } + } + + /// (Designed for separate transform pipeline in Blackwell) + /// Utilities to dequantize A. 
+ template + CUTLASS_DEVICE + static void dequantize_A_kblock_for_transform( + Tensor const& tArA, + Tensor& tArACompute, + cute::tuple const& partitioned_extra_info, + int const k_block) { + + static_assert(is_rmem::value, "Input tensor for A conversion must come from registers"); + static_assert(is_rmem::value, "Output tensor for A conversion must come from registers"); + static_assert(cosize_v == cosize_v); + static_assert(size_v == cosize_v); + static_assert(size_v == cosize_v); + using SrcType = typename EngineIn::value_type; + using DstType = typename EngineOut::value_type; + + auto src = tArA(_, _, _, k_block); + auto dst = tArACompute(_, _, _, k_block); + constexpr int num_elements = decltype(size(src))::value; + + constexpr int pack = decltype(select_packing::value())::value; + using Converter = cutlass::NumericArrayConverter; + using SrcArray = cutlass::Array; + using DstArray = cutlass::Array; + constexpr int DstElementsPerReg = 32 / sizeof_bits_v; + using RegArray = cutlass::AlignedArray; + + auto src_arr = recast(src); + auto dst_arr = recast(dst); + + Tensor dst_vm = cute::group_modes<1,-1>(cute::zipped_divide(dst, pack)); + + cute::transform(src_arr, dst_arr, Converter::convert); + + if constexpr (ModeHasScales) { + + auto const& scales = cute::get<1>(partitioned_extra_info)(_,_,_,k_block); + + CUTE_STATIC_ASSERT_V(size(src) == size(scales)); + + if constexpr (is_same_v) { + + using ScaleArray = cutlass::Array; + auto scale_arr = recast(filter_zeros(scales)); + + if constexpr (is_same_v){ + Tensor scales_vm = cute::group_modes<1,-1>(cute::zipped_divide(scales, pack)); + + for (int i = 0; i < size<1>(dst_vm); ++i){ + auto&& r = cute::recast(dst_vm(_,i))(0); + auto&& scale_reg = cute::recast(scales_vm(_,i))(0); + CUTLASS_PRAGMA_UNROLL + for (size_t ii = 0; ii < RegArray::kElements; ++ii) { + __nv_bfloat162& bf16x2_val = reinterpret_cast<__nv_bfloat162&>(r[ii]); + bf16x2_val = __hmul2(bf16x2_val, + reinterpret_cast(scale_reg[ii])); + } + } + } + else{ + 
cute::transform(dst_arr, scale_arr, dst_arr, cute::multiplies{}); + } + } + if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) { + // Do Nothing + } + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) { + static_assert(is_same_v, "ElementScale and ElementZero must be the same."); + + auto const& zeros = cute::get<3>(partitioned_extra_info)(_,_,_,k_block); + CUTE_STATIC_ASSERT_V(size(src) == size(zeros)); + + if constexpr (is_same_v) { + using ZeroArray = cutlass::Array; + auto zero_arr = recast(filter_zeros(zeros)); + + if constexpr (is_same_v) { + Tensor zeros_vm = cute::group_modes<1,-1>(cute::zipped_divide(zeros, pack)); + + for (int i = 0; i < size<1>(dst_vm); ++i){ + auto&& r = cute::recast(dst_vm(_,i))(0); + auto&& zero_reg = cute::recast(zeros_vm(_,i))(0); + CUTLASS_PRAGMA_UNROLL + for (size_t ii = 0; ii < RegArray::kElements; ++ii) { + __nv_bfloat162& bf16x2_val = reinterpret_cast<__nv_bfloat162&>(r[ii]); + bf16x2_val = __hadd2(bf16x2_val, + reinterpret_cast(zero_reg[ii])); + } + } + } + else{ + cute::transform(dst_arr, zero_arr, dst_arr, cute::plus{}); + } + } + } + else { + static_assert(cutlass::detail::dependent_false, "Conversion mode not handled for input partitioning."); + } + } +} + + + /// Utilities for any additional inputs inside of the TMA load + template < + class Params, + class TensorStorage, + class... 
Ts + > + CUTLASS_DEVICE + static auto partition_extra_tma_inputs( + Params const& mainloop_params, + cute::tuple const& load_inputs, + TensorStorage& shared_tensors, + uint2 const& cluster_local_block_id, + int const m_coord, + int const l_coord) { + + if constexpr (KernelConversionMode == ConversionMode::DirectConvert) { + return cute::make_tuple(); + } + else if constexpr (ModeHasScales) { + Tensor sS = make_tensor(make_smem_ptr(shared_tensors.smem_scale.begin()), SmemLayoutScale{}); // (BLK_M,BLK_K,PIPE) + Tensor gS_mkl = get<2>(load_inputs); + auto block_tma_s = mainloop_params.tma_load_scale.get_slice(cluster_local_block_id.y); + Tensor gS = gS_mkl(_,_,m_coord,_,l_coord); // (BLK_M,BLK_K,k) + + Tensor tSgS = block_tma_s.partition_S(gS); + Tensor tSsS = block_tma_s.partition_D(sS); // (TMA,TMA_M,TMA_K,PIPE) + if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) { + return cute::make_tuple(tSgS, tSsS); + } + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) { + Tensor sZ = make_tensor(make_smem_ptr(shared_tensors.smem_zero.begin()), SmemLayoutScale{}); // (BLK_M,BLK_K,PIPE) + Tensor gZ_mkl = get<3>(load_inputs); + auto block_tma_z = mainloop_params.tma_load_zero.get_slice(cluster_local_block_id.y); + Tensor gZ = gZ_mkl(_,_,m_coord,_,l_coord); // (BLK_M,BLK_K,k) + + Tensor tZgZ = block_tma_z.partition_S(gZ); + Tensor tZsZ = block_tma_z.partition_D(sZ); // (TMA,TMA_M,TMA_K,PIPE) + return cute::make_tuple(tSgS, tSsS, tZgZ, tZsZ); + } + else { + static_assert(cutlass::detail::dependent_false, "Conversion mode not handled for input partitioning."); + } + } + else { + static_assert(cutlass::detail::dependent_false, "Conversion mode not handled for input partitioning."); + } + } + + /// Utilities for partitioning extra inputs for loading from smem in the mainloop. 
+ template < + class ThreadMma, + class TensorStorage + > + CUTLASS_DEVICE + static auto partition_extra_mma_info( + ThreadMma const& mma_thread_slice, + TensorStorage& shared_tensors) { + + if constexpr (KernelConversionMode == ConversionMode::DirectConvert) { + // nothing to do + return cute::make_tuple(); + } + else if constexpr (UseScaleLookupTable) { + Tensor sS = make_tensor(make_smem_ptr(shared_tensors.smem_scale.begin()), SmemLayoutScale{});// (BLK_M,BLK_SCALE_K,PIPE) + Tensor tCsS = mma_thread_slice.partition_A(sS); + Tensor tCrS_neg = make_tensor(mma_thread_slice.partition_fragment_A(sS(_,_,Int<0>{})).layout()); + Tensor tCrS_pos = make_tensor(mma_thread_slice.partition_fragment_A(sS(_,_,Int<0>{})).layout()); + + if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) { + return cute::make_tuple(tCsS, tCrS_neg, tCrS_pos); + } + } + else if constexpr (ModeHasScales) { + Tensor sS = make_tensor(make_smem_ptr(shared_tensors.smem_scale.begin()), SmemLayoutScale{});// (BLK_M,BLK_SCALE_K,PIPE) + Tensor tCsS = mma_thread_slice.partition_A(sS); + Tensor tCrS = make_tensor(mma_thread_slice.partition_fragment_A(sS(_,_,Int<0>{})).layout()); + + if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) { + return cute::make_tuple(tCsS, tCrS); + } + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) { + Tensor sZ = make_tensor(make_smem_ptr(shared_tensors.smem_zero.begin()), SmemLayoutScale{});// (BLK_M,BLK_SCALE_K,PIPE) + Tensor tCsZ = mma_thread_slice.partition_A(sZ); + Tensor tCrZ = make_tensor(mma_thread_slice.partition_fragment_A(sZ(_,_,Int<0>{})).layout()); + return cute::make_tuple(tCsS, tCrS, tCsZ, tCrZ); + } + else { + static_assert(cutlass::detail::dependent_false, "Conversion mode not handled in A -> RF path."); + } + } + else { + static_assert(cutlass::detail::dependent_false, "Conversion mode not handled in A -> RF path."); + } + } + + template < + class TiledMma, + class TiledCopy, + class 
TensorStorage + > + CUTLASS_DEVICE + static auto partition_extra_transform_info( + TiledMma const& tiled_mma, + TiledCopy const& smem_tiled_copy_S, + TensorStorage& shared_storage) { + + if constexpr (KernelConversionMode == ConversionMode::DirectConvert) { + // nothing to do + return cute::make_tuple(); + } + else if constexpr (ModeHasScales) { + ThrMMA cta_mma = TiledMma{}.get_slice(blockIdx.x % size(typename TiledMma::AtomThrID{})); + auto smem_thr_copy_S = smem_tiled_copy_S.get_slice(threadIdx.x % 128); + + Tensor sS = make_tensor(make_smem_ptr(shared_storage.input.smem_scale.begin()), SmemLayoutScale{}); // (BLK_M,BLK_SCALE_K,PIPE) + Tensor tCsS = cta_mma.partition_A(sS); + Tensor tSsS = smem_thr_copy_S.partition_S(tCsS); + Tensor tSrS = make_tensor(tSsS(_,_,_,_,0).shape()); + + if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) { + return cute::make_tuple(smem_tiled_copy_S, tSrS, tSsS); + } + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) { + Tensor sZ = make_tensor(make_smem_ptr(shared_storage.input.smem_zero.begin()), SmemLayoutScale{});// (BLK_M,BLK_SCALE_K,PIPE) + Tensor tCsZ = cta_mma.partition_A(sZ); + Tensor tZsZ = smem_thr_copy_S.partition_S(tCsZ); + Tensor tZrZ = make_tensor(tZsZ(_,_,_,_,0).shape()); + return cute::make_tuple(smem_tiled_copy_S, tSrS, tSsS, tZrZ, tZsZ); + } + else { + static_assert(cutlass::detail::dependent_false, "Conversion mode not handled in A -> RF path."); + } + } + else { + static_assert(cutlass::detail::dependent_false, "Conversion mode not handled in A -> RF path."); + } + } + + /// Returns the tiled copy and copy views for the extra inputs. 
+ template + CUTLASS_DEVICE + static auto retile_extra_mma_info( + TiledMma const& tiled_mma, + cute::tuple& partitioned_extra_info, + int const warp_group_thread_idx) { + + if constexpr (KernelConversionMode == ConversionMode::DirectConvert) { + // nothing to do + return cute::make_tuple(); + } + else if constexpr (ModeHasScales) { + auto smem_tiled_copy_S = make_tiled_copy_A(SmemCopyAtomScale{}, tiled_mma); + auto smem_thr_copy_S = smem_tiled_copy_S.get_thread_slice(warp_group_thread_idx); + Tensor tCrS_copy_view = smem_thr_copy_S.retile_D(cute::get<1>(partitioned_extra_info)); // (CPY,CPY_M,CPY_K) + + if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) { + return cute::make_tuple(smem_tiled_copy_S, tCrS_copy_view); + } + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) { + Tensor tCrZ_copy_view = smem_thr_copy_S.retile_D(cute::get<3>(partitioned_extra_info)); // (CPY,CPY_M,CPY_K) + return cute::make_tuple(smem_tiled_copy_S, tCrS_copy_view, tCrZ_copy_view); + } + else { + static_assert(cutlass::detail::dependent_false, "Conversion mode not handled in A -> RF path."); + } + } + else { + static_assert(cutlass::detail::dependent_false, "Conversion mode not handled in A -> RF path."); + } + } +}; + +} // cutlass::gemm::collective::detail diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/collective/sm103_kernel_type.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/collective/sm103_kernel_type.hpp new file mode 100644 index 0000000000000000000000000000000000000000..04120a41ae0f404ed22ed05d08f138526b8e9fc3 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/collective/sm103_kernel_type.hpp @@ -0,0 +1,45 @@ +/*************************************************************************************************** + * Copyright (c) 2025 - 2025 NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief Kernel type definitions specific for SM103 BlockScaled MMA +*/ + +#pragma once + +namespace cutlass::sm103::detail { + +enum class KernelPrefetchType { + TmaPrefetch, // TMA Prefetch (is the default version) + Disable // Disable Prefetch +}; + +} // namespace cutlass::sm103::detail diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/dependent_false.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/dependent_false.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d2dd6a16a67c12beece2645bf4781b820e07e78e --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/dependent_false.hpp @@ -0,0 +1,86 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::detail { + +/// @brief A bool constant that depends on one or more template parameters. +/// +/// For more detailed documentation and use cases, +/// please see `dependent_false` below. +template +inline constexpr bool dependent_bool_value = Value; + +/// @brief An always-false value that depends on one or more template parameters. +/// +/// This exists because `static_assert(false);` always fails, +/// even if it occurs in the `else` branch of an `if constexpr`. +/// The following example shows how to use `dependent_false` in that case. 
+/// +/// @code +/// template +/// void foo (T t) +/// { +/// if constexpr (std::is_integral_v) { +/// do_integer_stuff(t); +/// } +/// else if constexpr (std::is_floating_point_v) { +/// do_floating_point_stuff(t); +/// } +/// else { +/// static_assert(dependent_false, "T must be " +/// "an integral or floating-point type."); +/// } +/// } +/// @endcode +/// +/// This implements the C++ Standard Library proposal P1830R1. +/// +/// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2019/p1830r1.pdf +/// +/// That proposal is under review as of 2022/12/05. +/// The following link shows P1830's current review status. +/// +/// https://github.com/cplusplus/papers/issues/572 +/// +/// P2593R0 proposes an alternate solution to this problem, +/// that would change the C++ language itself. +/// +/// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p2593r0.html +/// +/// For headers in this library, however, we only consider library solutions +/// as work-arounds for future C++ features. +template +inline constexpr bool dependent_false = dependent_bool_value; + +} // end namespace cutlass::detail diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/helper_macros.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/helper_macros.hpp new file mode 100644 index 0000000000000000000000000000000000000000..cf9b803b27b3b148e1441260471ecab99e82bfd3 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/helper_macros.hpp @@ -0,0 +1,242 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief Helper macros for the CUTLASS library +*/ + +#pragma once + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#ifdef CUTLASS_NAMESPACE +#define concat_tok(a, b) a ## b +#define mkcutlassnamespace(pre, ns) concat_tok(pre, ns) +#define cutlass mkcutlassnamespace(cutlass_, CUTLASS_NAMESPACE) +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) +#define CUTLASS_HOST_DEVICE __forceinline__ __device__ __host__ +#define CUTLASS_DEVICE __forceinline__ __device__ +#elif defined(__CUDACC_RTC__) +#define CUTLASS_HOST_DEVICE __forceinline__ __device__ +#define CUTLASS_DEVICE __forceinline__ __device__ +#else +#define CUTLASS_HOST_DEVICE inline +#define CUTLASS_DEVICE inline +#endif + +#if ! defined(_MSC_VER) +#define CUTLASS_LAMBDA_FUNC_INLINE __attribute__((always_inline)) +#else +#define CUTLASS_LAMBDA_FUNC_INLINE [[msvc::forceinline]] +#endif + +#define CUTLASS_HOST __host__ +#define CUTLASS_GLOBAL __global__ static + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +CUTLASS_HOST_DEVICE void __CUTLASS_UNUSED(T const &) +{ } + +#if defined(__GNUC__) + #define CUTLASS_UNUSED(expr) __CUTLASS_UNUSED(expr) +#else + #define CUTLASS_UNUSED(expr) do { ; } while (&expr != &expr) +#endif + +#ifdef _MSC_VER +// Provides support for alternative operators 'and', 'or', and 'not' +#include +#endif // _MSC_VER + +#if !defined(__CUDACC_RTC__) +#include +#endif + +#if defined(__CUDA_ARCH__) + #if defined(_MSC_VER) + #define CUTLASS_NOT_IMPLEMENTED() { printf("%s not implemented\n", __FUNCSIG__); asm volatile ("brkpt;\n"); } + #else + #define CUTLASS_NOT_IMPLEMENTED() { printf("%s not implemented\n", __PRETTY_FUNCTION__); asm volatile ("brkpt;\n"); } + #endif +#else + #if defined(_MSC_VER) + #define CUTLASS_NOT_IMPLEMENTED() 
assert(0 && __FUNCSIG__) + #else + #define CUTLASS_NOT_IMPLEMENTED() assert(0 && __PRETTY_FUNCTION__) + #endif +#endif + +// CUTLASS_CMATH_NAMESPACE is the namespace where code can find +// functions like isnan and log. Such functions are in +// the std namespace in host code, but in the global namespace +// in device code. +// +// The intended use case for this macro is in "using" declarations +// for making argument-dependent lookup (ADL) work in generic code. +// For example, if T is cutlass::half_t, the following code will +// invoke cutlass::isnan(half_t). If T is float, it will invoke +// std::isnan on host and ::isnan on device. (CUTLASS's support +// for NVRTC prevents it from using things in the std namespace +// in device code.) Correct use of "using" declarations can help +// avoid unexpected implicit conversions, like from half_t to float. +// +// template +// bool foo(T x) { +// using CUTLASS_CMATH_NAMESPACE :: isnan; +// return isnan(x); +// } +// +// Without this macro, one would need to write the following. 
+// +// template +// bool foo(T x) { +// #if defined(__CUDA_ARCH__) +// using ::isnan; +// #else +// using std::isnan; +// #endif +// return isnan(x); +// } + +#if defined(__CUDA_ARCH__) +# define CUTLASS_CMATH_NAMESPACE +#else +# define CUTLASS_CMATH_NAMESPACE std +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + + +#ifndef CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED +#define CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED 0 +#endif + + +// CUDA 10.1 introduces the mma instruction +#if !defined(CUTLASS_ENABLE_TENSOR_CORE_MMA) +#define CUTLASS_ENABLE_TENSOR_CORE_MMA 0 +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define CUTLASS_ASSERT(x) assert(x) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// CUTLASS_PRAGMA_(UNROLL|NO_UNROLL) optimization directives for the CUDA compiler. 
+#if defined(__CUDA_ARCH__) && !defined(__INTELLISENSE__) + #if defined(__CUDACC_RTC__) || (defined(__clang__) && defined(__CUDA__)) + #define CUTLASS_PRAGMA_UNROLL _Pragma("unroll") + #define CUTLASS_PRAGMA_NO_UNROLL _Pragma("unroll 1") + #else + #define CUTLASS_PRAGMA_UNROLL #pragma unroll + #define CUTLASS_PRAGMA_NO_UNROLL #pragma unroll 1 + #endif + + #define CUTLASS_GEMM_LOOP CUTLASS_PRAGMA_NO_UNROLL + +#else + + #define CUTLASS_PRAGMA_UNROLL + #define CUTLASS_PRAGMA_NO_UNROLL + #define CUTLASS_GEMM_LOOP + +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#if !defined(__CUDACC_RTC__) +#define CUTLASS_THREAD_LOCAL thread_local +#else +#define CUTLASS_THREAD_LOCAL +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(_MSVC_LANG) +# define CUTLASS_CPLUSPLUS _MSVC_LANG +#else +# define CUTLASS_CPLUSPLUS __cplusplus +#endif + +// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/n4762.pdf +// Section 14.8 Predefined macro names +#if (201703L <= CUTLASS_CPLUSPLUS) +#define CUTLASS_CONSTEXPR_IF_CXX17 constexpr +#define CUTLASS_CXX17_OR_LATER 1 +#else +#define CUTLASS_CONSTEXPR_IF_CXX17 +#define CUTLASS_CXX17_OR_LATER 0 +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// __CUDA_ARCH_SPECIFIC__ is introduced in CUDA 12.9 +#if !defined(CUDA_ARCH_CONDITIONAL) + +#if defined(__CUDA_ARCH_SPECIFIC__) +#define CUDA_ARCH_CONDITIONAL(ARCH_XXYY) (__CUDA_ARCH_SPECIFIC__ == ARCH_XXYY) +#else +#define CUDA_ARCH_CONDITIONAL(ARCH_XXYY) (false) +#endif + +#endif + +// __CUDA_ARCH_FAMILY_SPECIFIC__ is introduced in CUDA 12.9 +#if !defined(CUDA_ARCH_FAMILY) + +#if defined(__CUDA_ARCH_FAMILY_SPECIFIC__) +#define CUDA_ARCH_FAMILY(ARCH_XXYY) (__CUDA_ARCH_FAMILY_SPECIFIC__ == ARCH_XXYY) +#else +#define CUDA_ARCH_FAMILY(ARCH_XXYY) (false) +#endif + +#endif + +#if 
!defined(CUDA_ARCH_CONDITIONAL_OR_FAMILY) +#define CUDA_ARCH_CONDITIONAL_OR_FAMILY(ARCH_XXYY) \ + (CUDA_ARCH_CONDITIONAL(ARCH_XXYY) || CUDA_ARCH_FAMILY(ARCH_XXYY)) +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +}; // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/layout.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/layout.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e1c1bd6c5529ccb7af5c70e558be327c81396106 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/layout.hpp @@ -0,0 +1,434 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cute/layout.hpp" +#include "cute/pointer_sparse.hpp" // cute::is_sparse +#include "cute/swizzle.hpp" // cute::Swizzle +#include "cute/swizzle_layout.hpp" // cute::get_swizzle_portion +#include "cute/util/type_traits.hpp" +#include "cute/arch/copy_sm90_tma.hpp" +#include "cute/arch/copy_sm100_tma.hpp" + +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/numeric_types.h" +#include "cutlass/detail/collective.hpp" + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::detail { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// For each cutlass::layout, provides its corresponding cute stride types, 64b by default + +template +struct TagToStrideA { + using type = L; +}; + +// Maps to modes [M, K, L] +template <> +struct TagToStrideA { + using type = cute::Stride, int64_t>; + using tag = layout::RowMajor; +}; + +// Maps to modes [M, K, L] +template <> +struct TagToStrideA { + using type = 
cute::Stride, int64_t, int64_t>; + using tag = layout::ColumnMajor; +}; + +template +struct TagToStrideB { + using type = L; +}; + +// Maps to modes [N, K, L] +template <> +struct TagToStrideB { + using type = cute::Stride, int64_t, int64_t>; + using tag = layout::RowMajor; +}; + +// Maps to modes [N, K, L] +template <> +struct TagToStrideB { + using type = cute::Stride, int64_t>; + using tag = layout::ColumnMajor; +}; + +// For each cutlass::layout *, provides its corresponding cute stride types, 64b by default +// Used by pointer array and grouped gemm +// Maps to modes [M, K, L] +template <> +struct TagToStrideA { + using UnderlyingType = cute::Stride, cute::Int<0>>; + using type = UnderlyingType*; + using tag = layout::RowMajor; +}; + +// Maps to modes [M, K, L] +template <> +struct TagToStrideA { + using UnderlyingType = cute::Stride, int64_t, cute::Int<0>>; + using type = UnderlyingType*; + using tag = layout::ColumnMajor; +}; + +// Maps to modes [N, K, L] +template <> +struct TagToStrideB { + using UnderlyingType = cute::Stride, int64_t, cute::Int<0>>; + using type = UnderlyingType*; + using tag = layout::RowMajor; +}; + +// Maps to modes [N, K, L] +template <> +struct TagToStrideB { + using UnderlyingType = cute::Stride, cute::Int<0>>; + using type = UnderlyingType*; + using tag = layout::ColumnMajor; +}; + +// Maps to modes [M, N, L] +template +struct TagToStrideC : TagToStrideA { }; + +// Conv: Maps to modes ((P,N), C, _0) for compatiblity with GEMM epilogues expecting a batch mode stride +template <> +struct TagToStrideC { + using type = cute::Stride, cute::Int<1>, cute::Int<0>>; +}; + +// Conv: Maps to modes ((P,Q,N), C, _0) for compatiblity with GEMM epilogues expecting a batch mode stride +template <> +struct TagToStrideC { + using type = cute::Stride, cute::Int<1>, cute::Int<0>>; +}; + +// Conv: Maps to modes ((P,Q,Z,N), C, _0) for compatiblity with GEMM epilogues expecting a batch mode stride +template <> +struct TagToStrideC { + using type = 
cute::Stride, cute::Int<1>, cute::Int<0>>; +}; + +// Conv: Maps to modes (K, (C,S), _0) for compatiblity with GEMM epilogues expecting a batch mode stride +template <> +struct TagToStrideC { + using type = cute::Stride, int64_t>, cute::Int<0>>; +}; + +// Conv: Maps to modes (K, (C,S,R), _0) for compatiblity with GEMM epilogues expecting a batch mode stride +template <> +struct TagToStrideC { + using type = cute::Stride, int64_t, int64_t>, cute::Int<0>>; +}; + +// Conv: Maps to modes (K, (C,S,R,T), _0) for compatiblity with GEMM epilogues expecting a batch mode stride +template <> +struct TagToStrideC { + using type = cute::Stride, int64_t, int64_t, int64_t>, cute::Int<0>>; +}; + +// Conv: Maps to modes ((C,S), K, _0) for compatiblity with GEMM epilogues expecting a batch mode stride +template <> +struct TagToStrideC { + using type = cute::Stride, int64_t>, int64_t, cute::Int<0>>; +}; + +// Conv: Maps to modes ((C,S,R), K, _0) for compatiblity with GEMM epilogues expecting a batch mode stride +template <> +struct TagToStrideC { + using type = cute::Stride, int64_t, int64_t>, int64_t, cute::Int<0>>; +}; + +// Conv: Maps to modes ((C,S,R,T), K, _0) for compatiblity with GEMM epilogues expecting a batch mode stride +template <> +struct TagToStrideC { + using type = cute::Stride, int64_t, int64_t, int64_t>, int64_t, cute::Int<0>>; +}; + +// Convenience aliases +template +using TagToStrideA_t = typename TagToStrideA::type; + +template +using TagToStrideB_t = typename TagToStrideB::type; + +template +using TagToStrideC_t = typename TagToStrideC::type; + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// For 2.x compatibility APIs, provide stride->layout tag mappers + +template +constexpr bool +is_major(Stride = {}) { + // Account for stride types with and without batch mode and batch modes with static zero stride + return cute::is_constant<1, decltype(cute::front(cute::get(cute::remove_pointer_t{})))>::value; +} + 
+template +constexpr bool +is_major(cute::Layout = {}) { + return is_major(Stride{}); +} + +// Note : This method can be used for deducing the Layout Tag of A, C, D Matrices +template +constexpr +auto +stride_to_layout_tag_A() { + using InternalStrideA = cute::remove_pointer_t; + if constexpr (cute::is_layout::value) { + return stride_to_layout_tag_A(); + } + else if constexpr (is_major<0, StrideA>()) { // M major + return layout::ColumnMajor{}; + } + // Specialize for sparse layout + else if constexpr (cute::get<0>(InternalStrideA{}) == cute::_2{} && + cute::rank(cute::get<1>(InternalStrideA{})) == 2 && + cute::is_same_v(InternalStrideA{}))>>) { + return layout::ColumnMajor{}; + } + else { // K major + return layout::RowMajor{}; + } + + CUTE_GCC_UNREACHABLE; +} + +template +constexpr +auto +stride_to_layout_tag_B() { + using InternalStrideB = cute::remove_pointer_t; + if constexpr (cute::is_layout::value) { + return stride_to_layout_tag_B(); + } + else if constexpr (is_major<0, StrideB>()) { // N major + return layout::RowMajor{}; + } + else { // K major + return layout::ColumnMajor{}; + } + + CUTE_GCC_UNREACHABLE; +} + +template +constexpr +auto +stride_to_layout_tag_C() { + using InternalStrideC = cute::remove_pointer_t; + if constexpr (cute::is_layout::value) { + return stride_to_layout_tag_C(); + } + else if constexpr (is_major<0, StrideC>()) { // M major + return layout::ColumnMajor{}; + } + else { // N major + return layout::RowMajor{}; + } + + CUTE_GCC_UNREACHABLE; +} + +// Utilities to map Stride back on to their corresponding layout tags +template +struct StrideToLayoutTagA { + using type = decltype(detail::stride_to_layout_tag_A()); +}; + +template +struct StrideToLayoutTagB { + using type = decltype(detail::stride_to_layout_tag_B()); +}; + +template +struct StrideToLayoutTagC { + using type = decltype(detail::stride_to_layout_tag_C()); +}; + +// Convenience aliases +template +using StrideToLayoutTagA_t = typename StrideToLayoutTagA::type; + +template 
+using StrideToLayoutTagB_t = typename StrideToLayoutTagB::type; + +template +using StrideToLayoutTagC_t = typename StrideToLayoutTagC::type; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Inspects a tiled copy and whether its copy engine is TMA or not +template +constexpr bool is_tma_copy_engine() { + if constexpr (cute::is_void_v) { + return false; + } + else { + if constexpr ( cute::is_base_of_v + || cute::is_base_of_v + || cute::is_base_of_v + || cute::is_base_of_v + || cute::is_base_of_v + || cute::is_base_of_v + || cute::is_base_of_v + || cute::is_base_of_v + ) { + return true; + } + } + return false; +} + +template +struct RawDtype { using type = X; }; + +template +struct RawDtype> { using type = typename X::raw_type; }; + + +// Inspects a TiledCopy and returns its alignment in terms of element count +template +constexpr int +get_alignment_count_from_gmem_tiled_copy() { + + if constexpr (cute::is_void_v) { + return 1; + } + + // Account for ElementC = void kernels + else if constexpr (cute::is_void_v) { + return 0; + } + + else { + // For TMA tiled copies, we know the alignment has to be 128 bits + if constexpr (is_tma_copy_engine()) { + if constexpr ( cute::is_same_v::type, cutlass::detail::float_e2m1_unpacksmem_t> || + cute::is_same_v::type, cutlass::detail::float_e3m2_unpacksmem_t> || + cute::is_same_v::type, cutlass::detail::float_e2m3_unpacksmem_t> || + cute::is_same_v::type, cutlass::detail::type_erased_dynamic_float4_unpacksmem_t> || + cute::is_same_v::type, cutlass::detail::type_erased_dynamic_float6_unpacksmem_t> || + cutlass::gemm::collective::detail::is_sm10x_f8f6f4_element() && cute::is_same_v::type, uint8_t>) { + return 128; + } + + // For sparse MMA, alignment in logical elements is increased by sparsity factor + if constexpr (cute::is_sparse_v) { + return 128 / sizeof_bits::value * ElementMma::sparsity; + } + return 128 / sizeof_bits::value; + } + else { + // For non-TMA tiled 
copies, TiledCopy holds the alignment count directly in its TiledShape_MN + return GmemTiledCopy::NumValSrc; + } + } +} + +// Return alignment bit requirements for the GEMM inputs. +template < + class ElementType + , bool IsF8F6F4SubBytes=false +> +constexpr int +get_input_alignment_bits() { + if constexpr (IsF8F6F4SubBytes && sizeof_bits::value == 4) { + // 16U4 format: The inner tensor size dimension should be multiple of 64B. + return 64 * 8; + } + else if constexpr (IsF8F6F4SubBytes && sizeof_bits::value == 6) { + // 16U6 format : The inner tensor size dimension must be a multiple of 96B. + return 96 * 8; + } + // TMA 16B alignment requirement + return 128; +} + +// Return alignment bit requirements for the GEMM outputs. +template +constexpr int +get_output_alignment_bits() { + if constexpr (sizeof_bits::value == 6) { + // 16U6 format : The inner tensor size dimension must be a multiple of 96B. + return 96 * 8; + } + // TMA 16B alignment requirement + return 128; +} + +// Check if tensor layout satisfies a given major alignment +template +CUTLASS_HOST_DEVICE constexpr +bool +check_alignment(cute::Layout const& layout) { + // Condition: shape must divide by Alignment without rounding + bool shape_check = cute::size(layout.shape()) == Alignment * cute::size(cute::upcast(layout)); + // Condition: every dynamic stride must be a multiple of Alignment + bool stride_check = cute::all_of(cute::flatten(layout.stride()), [](auto s){ return cute::is_static::value || (s % Alignment == 0); }); + return shape_check && stride_check; +} + +// Check if tensor layout satisfies a given major alignment +template +CUTLASS_HOST_DEVICE constexpr +bool +check_alignment(Shape const& shape, Stride const& stride) { + return check_alignment(cute::make_layout(shape, stride)); +} + +template +CUTLASS_HOST_DEVICE constexpr +size_t +alignment_for_swizzle(cute::Swizzle) { + static_assert(B >= 0 and M >= 0); + return size_t(1) << size_t(B + M + cute::abs(S)); +} + +template +CUTLASS_HOST_DEVICE 
constexpr +size_t +alignment_for_swizzle(Layout layout) { + return alignment_for_swizzle(cute::get_swizzle_portion(layout)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::detail diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/mainloop_fusion_helper_scale_factor.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/mainloop_fusion_helper_scale_factor.hpp new file mode 100644 index 0000000000000000000000000000000000000000..84de1c7d3c9359b94ababf184a2c2db724236b11 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/mainloop_fusion_helper_scale_factor.hpp @@ -0,0 +1,75 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Mainloop Fusion configs specific for scale factors +*/ + +#pragma once + +#include // cute::void_t + +namespace cutlass::detail { + +///////////////////////////////////////////////////////////////////////////////////////////////// +template +struct ElementSFType { + using type = void; +}; + +template +struct ElementSFType> { + using type = typename CollectiveMainloop::ElementSF; +}; + +template +struct LayoutSFAType { + using type = void; +}; + +template +struct LayoutSFAType> { + using type = typename CollectiveMainloop::LayoutSFA; +}; + +template +struct LayoutSFBType { + using type = void; +}; + +template +struct LayoutSFBType> { + using type = typename CollectiveMainloop::LayoutSFB; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::detail diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/mma.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/mma.hpp new file mode 100644 
index 0000000000000000000000000000000000000000..b4cbd3864a7fbfc524229cb183c62564cead1e7f --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/mma.hpp @@ -0,0 +1,87 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include "cutlass/arch/mma.h" +#include "cute/layout.hpp" + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::detail { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct IsSparseTensorOp : cute::false_type { }; + +// TiledMma for sparse must have ValTypeE +template +struct IsSparseTensorOp> + : cute::true_type { }; + + +template +struct IsBlockScaledTensorOp : cute::false_type { }; + +// TiledMma for blockScaled must have FrgTypeSFA +template +struct IsBlockScaledTensorOp> + : cute::true_type { }; + + +// The following metafunction is used to extract the OperatorClass from a cutlass 3.x kernel. +template +struct get_operator_class { + static constexpr bool is_sparse_op = IsSparseTensorOp::value; + static constexpr bool is_block_scaled_op = IsBlockScaledTensorOp::value; + // All tensorop operations have atom shape's M >= 8 + static constexpr bool is_tensor_op = cute::size<0>(typename TiledMma::AtomShape_MNK{}) >= 8; + using type = cute::conditional_t< + is_tensor_op, + cute::conditional_t< + is_sparse_op, + cutlass::arch::OpClassSparseTensorOp, + cute::conditional_t< + is_block_scaled_op, + cutlass::arch::OpClassBlockScaledTensorOp, + cutlass::arch::OpClassTensorOp + > + >, + cutlass::arch::OpClassSimt + >; +}; + +template +using get_operator_class_t = typename get_operator_class::type; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::detail diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/sm100_blockscaled_layout.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/sm100_blockscaled_layout.hpp new file mode 100644 index 
0000000000000000000000000000000000000000..e4f20cb237cb9b6960275339ee803e00f8e40031 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/sm100_blockscaled_layout.hpp @@ -0,0 +1,242 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +/*! \file + \brief Blocked Scale configs specific for SM100 BlockScaled MMA +*/ + +#pragma once + +#include "cutlass/layout/matrix.h" + +#include "cute/int_tuple.hpp" +#include "cute/atom/mma_traits_sm100.hpp" + +namespace cutlass::detail { + +///////////////////////////////////////////////////////////////////////////////////////////////// +using namespace cute; + +template +struct Sm1xxBlockScaledBasicChunk { + + using Blk_MN = _128; + using Blk_SF = _4; + + using SfKMajorAtom = Layout< Shape< Shape<_32,_4>, Shape, _4>>, + Stride, Stride< _0, _1>>>; + using SfMNMajorAtom = Layout< Shape< Shape, _4>, Shape<_32,_4>>, + Stride, Stride<_16,_4>>>; + using SfAtom = cute::conditional_t; +}; + +template +struct Sm1xxBlockScaledConfig { + // We are creating the SFA and SFB tensors' layouts in the collective since they always have the same layout. + // k-major order + static constexpr int SFVecSize = SFVecSize_; + using Sm1xxBlkScaledChunk = Sm1xxBlockScaledBasicChunk; + using Blk_MN = typename Sm1xxBlkScaledChunk::Blk_MN; + using Blk_SF = typename Sm1xxBlkScaledChunk::Blk_SF; + using SfAtom = typename Sm1xxBlkScaledChunk::SfAtom; + + using LayoutSF = decltype(blocked_product(SfAtom{}, make_layout( make_shape(int32_t(0), int32_t(0), int32_t(0)), + make_stride(int32_t(0), _1{}, int32_t(0))))); + + CUTE_HOST_DEVICE + static constexpr auto + deduce_layoutSFA() { + return LayoutSF{}; + } + + CUTE_HOST_DEVICE + static constexpr auto + deduce_layoutSFB() { + return LayoutSF{}; + } + + // The following function is provided for user fill dynamic problem size to the layout_SFA. 
+ template < class ProblemShape, class LayoutSFA = LayoutSF> + CUTE_HOST_DEVICE + static constexpr auto + tile_atom_to_shape_SFA(ProblemShape problem_shape, LayoutSFA layout_sfa = LayoutSFA{}) { + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_MNKL; + return tile_to_shape(SfAtom{}, make_shape(M,K,L), Step<_2,_1,_3>{}); + } + + // The following function is provided for user fill dynamic problem size to the layout_SFB. + template + CUTE_HOST_DEVICE + static constexpr auto + tile_atom_to_shape_SFB(ProblemShape problem_shape, LayoutSFB layout_sfb = LayoutSFB{}) { + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_MNKL; + return tile_to_shape(SfAtom{}, make_shape(N,K,L), Step<_2,_1,_3>{}); + } + + template + CUTE_HOST_DEVICE + static constexpr auto + deduce_smem_layoutSFA(TiledMma tiled_mma, TileShape_MNK tileshape_mnk) { + + constexpr int MMA_NSF = TiledMma::K / SFVecSize; + // Basic storage block for new Scaling Factor Layouts + using mnBasicBlockShape = Shape<_32,_4>; + using mnBasicBlockStride = Stride<_16,_4>; + using kBasicBlockShape = Shape, Int>; + using kBasicBlockStride = Stride<_0, _1>; + + // ((MMA_TILE_M,MMA_TILE_K), MMA_M, MMA_K) + using MmaShapeA_MK = decltype(partition_shape_A(TiledMma{}, make_shape(cute::size<0>(TileShape_MNK{}), + cute::size<2>(TileShape_MNK{})))); + // ((MMA_TILE_N,MMA_TILE_K), MMA_N, MMA_K) + using MmaShapeB_NK = decltype(partition_shape_B(TiledMma{}, make_shape(cute::size<1>(TileShape_MNK{}), + cute::size<2>(TileShape_MNK{})))); + // A single indivisible block will hold 4 scale factors of 128 rows/columns (A/B matrix). + // 4 is chosen to make consecutive 32bits of data to have scale factors for only a single row (col). 
32bits corresponds to the TMEM word size + using Blk_MN = typename Sm1xxBlkScaledChunk::Blk_MN; + using Blk_SF = typename Sm1xxBlkScaledChunk::Blk_SF; + using Blk_Elems = decltype(Blk_MN{} * Blk_SF{}); + + using TL_VMNK = typename TiledMma::ThrLayoutVMNK; + constexpr TL_VMNK tl_vmnk{}; + constexpr int MMA_M = cute::size<0>(TileShape_MNK{}) / cute::size<0>(tl_vmnk); + using mma_SFA_shape = decltype( make_shape( prepend(Int{}/Blk_MN{}, mnBasicBlockShape{}), kBasicBlockShape{})); + using mma_SFA_stride = decltype(make_stride( prepend( Blk_Elems{}, mnBasicBlockStride{}), kBasicBlockStride{})); + using sSFA_shape = decltype( make_shape( mma_SFA_shape{}, _1{}, make_shape( Blk_SF{}/Int{}, Int(TileShape_MNK{}) / SFVecSize / Blk_SF{}>{}))); + using sSFA_stride = decltype(make_stride(mma_SFA_stride{}, _0{}, make_stride( Int{}, Int{}))); + using SmemLayoutAtomSFA = decltype(make_layout(sSFA_shape{}, sSFA_stride{})); + return SmemLayoutAtomSFA{}; + } + + template + CUTE_HOST_DEVICE + static constexpr auto + deduce_smem_layoutSFB(TiledMma tiled_mma, TileShape_MNK tileshape_mnk) { + + constexpr int MMA_NSF = TiledMma::K / SFVecSize; + // Basic storage block for new Scaling Factor Layouts + using mnBasicBlockShape = Shape<_32,_4>; + using mnBasicBlockStride = Stride<_16,_4>; + using kBasicBlockShape = Shape, Int>; + using kBasicBlockStride = Stride<_0, _1>; + + // ((MMA_TILE_M,MMA_TILE_K), MMA_M, MMA_K) + using MmaShapeA_MK = decltype(partition_shape_A(TiledMma{}, make_shape(cute::size<0>(TileShape_MNK{}), + cute::size<2>(TileShape_MNK{})))); + // ((MMA_TILE_N,MMA_TILE_K), MMA_N, MMA_K) + using MmaShapeB_NK = decltype(partition_shape_B(TiledMma{}, make_shape(cute::size<1>(TileShape_MNK{}), + cute::size<2>(TileShape_MNK{})))); + // A single indivisible block will hold 4 scale factors of 128 rows/columns (A/B matrix). + // 4 is chosen to make consecutive 32bits of data to have scale factors for only a single row (col). 
32bits corresponds to the TMEM word size + using Blk_MN = typename Sm1xxBlkScaledChunk::Blk_MN; + using Blk_SF = typename Sm1xxBlkScaledChunk::Blk_SF; + using Blk_Elems = decltype(Blk_MN{} * Blk_SF{}); + + using TL_VMNK = typename TiledMma::ThrLayoutVMNK; + constexpr TL_VMNK tl_vmnk{}; + constexpr int MMA_N = cute::size<1>(TileShape_MNK{}); + // If MMA_N is 192, we need to operate at MMA_N = 256 granularity for UTCCP to work for ScaleFactorB. + // Both TMA and UTCCP will transfer scale factor B as if we have 256 columns in B matrix. + constexpr int MMA_N_SFB = cutlass::ceil_div(MMA_N, Blk_MN{}) * Blk_MN{}; + using mma_SFB_shape = decltype(make_shape( prepend( Int{}/Blk_MN{}, mnBasicBlockShape{}), kBasicBlockShape{})); + using mma_SFB_stride = decltype(make_stride(prepend( Blk_Elems{}, mnBasicBlockStride{}), kBasicBlockStride{})); + using sSFB_shape = decltype( make_shape( mma_SFB_shape{}, _1{}, make_shape( Blk_SF{}/Int{}, Int(TileShape_MNK{}) / SFVecSize / Blk_SF{}>{}))); + using sSFB_stride = decltype(make_stride(mma_SFB_stride{}, _0{}, make_stride( Int{}, Int{}))); + using SmemLayoutAtomSFB = decltype(make_layout(sSFB_shape{}, sSFB_stride{})); + return SmemLayoutAtomSFB{}; + } +}; + + +template +struct Sm1xxBlockScaledOutputConfig { + // We are creating the SFD tensors' layouts in the collective. 
+ // k-major order + static constexpr int SFVecSize = SFVecSize_; + using Sm1xxBlkScaledChunk = cutlass::detail::Sm1xxBlockScaledBasicChunk; + using Blk_MN = typename Sm1xxBlkScaledChunk::Blk_MN; + using Blk_SF = typename Sm1xxBlkScaledChunk::Blk_SF; + using SfAtom = typename Sm1xxBlkScaledChunk::SfAtom; + + using LayoutKMajorSF = decltype(blocked_product(SfAtom{}, make_layout(make_shape (int32_t(0), int32_t(0), int32_t(0)), + make_stride(int32_t(0), _1{}, int32_t(0))))); + + using LayoutMNMajorSF = decltype(blocked_product(SfAtom{}, make_layout(make_shape (int32_t(0), int32_t(0), int32_t(0)), + make_stride( _1{}, int32_t(0), int32_t(0))))); + + using LayoutSF = cute::conditional_t; + + CUTE_HOST_DEVICE + static constexpr auto + deduce_layoutSFD() { + return LayoutSF{}; + } + + // The following function is provided for user fill dynamic problem size to the layout_SFC. + template + CUTE_HOST_DEVICE + static constexpr auto + tile_atom_to_shape_SFD(ProblemShape problem_shape, LayoutSFD layout_sfc = LayoutSFD{}) { + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_MNKL; + if constexpr (major == UMMA::Major::K) { + return tile_to_shape(SfAtom{}, make_shape(M,N,L), Step<_2,_1,_3>{}); + } + else { + return tile_to_shape(SfAtom{}, make_shape(M,N,L), Step<_1,_2,_3>{}); + } + } +}; + +//// Describe the Scalefactor Tensor without VectorSize +struct Sm1xxBlockScaledTensorConfig { + // k-major order + // The blockscaled tensor does not need to know vectorsize + using Blk_M = _128; + using Blk_N = _4; + using SfAtom = Layout< Shape< Shape<_32,_4>, Shape<_4>>, + Stride, Stride<_1>>>; + + template + CUTE_HOST_DEVICE + static constexpr auto + tile_atom_to_shape(ProblemShape problem_shape) { + auto problem_shape_MNL = append<3>(problem_shape, 1); + auto [M, N, L] = problem_shape_MNL; + return tile_to_shape(SfAtom{}, make_shape(M,N,L), Step<_2,_1,_3>{}); + } +}; + 
+///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::detail diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/sm100_mixed_dtype_blockwise_layout.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/sm100_mixed_dtype_blockwise_layout.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b6c92c4d1199995029a550f61dce6a9903d7333e --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/sm100_mixed_dtype_blockwise_layout.hpp @@ -0,0 +1,182 @@ +/*************************************************************************************************** + * Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Block Wise Scale configs specific for SM100 Blockwise/Groupwise MMA +*/ + +#pragma once + +#include "cutlass/layout/matrix.h" + +#include "cute/int_tuple.hpp" +#include "cute/atom/mma_traits_sm100.hpp" + +namespace cutlass::detail{ + +///////////////////////////////////////////////////////////////////////////////////////////////// +using namespace cute; + +template +struct Sm100MixedInputBlockwiseScaleConfig { + + using ShapeScale = Shape, int32_t>, Shape, int32_t>, int32_t>; + + using StrideScale = conditional_t,Stride<_0,int32_t>, int32_t>, + Stride,Stride<_0,_1>, int32_t>>; + + using LayoutScale = Layout; + + CUTE_HOST_DEVICE + static constexpr auto + deduce_layout_scale() { + return LayoutScale{}; + } + + template + CUTE_HOST_DEVICE + static constexpr auto + smem_atom_layout_scale(CtaShape_MN_K cta_shape_mn_k) { + static_assert(cute::is_static_v, "Expect static CTA shape"); + + int constexpr size_MN = cute::get<0>(CtaShape_MN_K{}); + int constexpr size_K = cute::get<1>(CtaShape_MN_K{}); + + int constexpr SmemSizeMN = (SFVecSizeMN < size_MN) + ? SFVecSizeMN + : size_MN; + + int constexpr SmemSizeK = (SFVecSizeK < size_K) + ? 
SFVecSizeK + : size_K; + + int constexpr div_MN = cute::ceil_div(size_MN, SmemSizeMN); + int constexpr div_K = cute::ceil_div(size_K, SmemSizeK); + + auto strides = [&]() CUTLASS_LAMBDA_FUNC_INLINE { + if constexpr (majorSFA == UMMA::Major::MN) { + return make_stride(make_stride(_0{}, _1{}), make_stride(_0{}, Int{})); + } + else { + return make_stride(make_stride(_0{}, Int{}), make_stride(_0{}, _1{})); + } + }(); + + return make_layout( + make_shape(make_shape(Int{}, Int{}), + make_shape(Int{}, Int{})), + strides + ); + } + + + + // The following function is provided for user fill dynamic problem size to the layout_SFA. + template + CUTE_HOST_DEVICE + static constexpr auto + tile_atom_to_shape_scale(ScaledInputDim scale_input_dims) { + const auto scale_input_dims_MNKL = append<3>(scale_input_dims, 1); + + auto strides = [&]() CUTLASS_LAMBDA_FUNC_INLINE { + auto [MN, K, L] = scale_input_dims_MNKL; + if constexpr (majorSFA == UMMA::Major::MN) { + return make_stride(make_stride(_0{}, _1{}), make_stride(_0{}, cute::ceil_div(MN, SFVecSizeMN))); + } + else { + return make_stride(make_stride(_0{}, cute::ceil_div(K, SFVecSizeK)), make_stride(_0{}, _1{})); + } + }(); + + auto [MN, K, L] = scale_input_dims_MNKL; + auto mk_layout = make_layout( + make_shape(make_shape(Int{}, cute::ceil_div(MN, SFVecSizeMN)), + make_shape(Int{}, cute::ceil_div(K, SFVecSizeK))), + strides + ); + + return make_layout(append(shape(mk_layout), L), append(stride(mk_layout), size(filter_zeros(mk_layout)))); + } + +}; + +template +struct RuntimeMixedInputBlockwiseScaleConfig { + + using ShapeScale = Shape, Shape, int32_t>; + + using StrideScale = conditional_t,Stride<_0,int32_t>, int32_t>, + Stride,Stride<_0,_1>, int32_t>>; + + using LayoutScale = Layout; + + CUTE_HOST_DEVICE + static constexpr auto + deduce_layout_scale() { + return LayoutScale{}; + } + + // The following function is provided for user fill dynamic problem size to the layout_S. 
+ template + CUTE_HOST_DEVICE + static constexpr auto + tile_atom_to_shape_scale(ProblemShape problem_shape, SFVecShape sf_vec_shape) { + auto problem_shape_MNKL = append<3>(problem_shape, 1); + + auto strides = [&]() CUTLASS_LAMBDA_FUNC_INLINE { + auto [MN, K, L] = problem_shape_MNKL; + auto [sfmn, sfk] = sf_vec_shape; + if constexpr (majorScale == UMMA::Major::MN) { + return make_stride(make_stride(_0{}, _1{}), make_stride(_0{}, cute::ceil_div(MN, sfmn))); + } + else { + return make_stride(make_stride(_0{}, cute::ceil_div(K, sfk)), make_stride(_0{}, _1{})); + } + }(); + + auto [MN, K, L] = problem_shape_MNKL; + auto [sfmn, sfk] = sf_vec_shape; + auto mk_layout = make_layout( + make_shape(make_shape(sfmn, cute::ceil_div(MN, sfmn)), + make_shape(sfk, cute::ceil_div(K, sfk))), + strides + ); + + return make_layout(append(shape(mk_layout), L), append(stride(mk_layout), size(filter_zeros(mk_layout)))); + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::detail diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/sm100_tmem_helper.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/sm100_tmem_helper.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f12bac12dc898f177d95438ca10a1b060f4402ac --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/sm100_tmem_helper.hpp @@ -0,0 +1,76 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + + + +/*! 
\file + \brief TMEM Accumulator Helpers for SM100 +*/ + +#pragma once + +#include "cute/tensor.hpp" +#include "cute/atom/mma_atom.hpp" + + +namespace cutlass::detail{ +constexpr uint32_t TmemColMask = 0x0000'FFFF; + +template +CUTE_HOST_DEVICE +static constexpr auto find_tmem_tensor_col_offset(TmemTensor tensor) { + using namespace cute; + return cosize(recast(tensor).layout()) & TmemColMask; +} + +template +CUTE_HOST_DEVICE +static constexpr auto make_sm100_accumulator(TiledMma tiled_mma, AccumulatorShape acc_shape, EpilogueTile epilogue_tile) { + using namespace cute; + static_assert(rank(acc_shape) == 3 || (rank(acc_shape) == 4 && IsOverlappingAccum == false), + "Expect a rank >= 3 accumulator shape compatible with an SM100 tiled mma, Overlapping accumulators is only available for non-complex kernels"); + if constexpr (IsOverlappingAccum) { + Tensor accumulators_tmp = TiledMma::make_fragment_C(append(acc_shape, Int<2>{})); + return make_tensor( + accumulators_tmp.data(), + shape(accumulators_tmp), + replace<3>( + stride(accumulators_tmp), + Int<(256 - size<1>(EpilogueTile{})) * stride<0, 1>(accumulators_tmp.layout())>{})); + } else { + return TiledMma::make_fragment_C(append( + acc_shape, + Int{})); // ((MMA_TILE_M,MMA_TILE_N),MMA_M,MMA_N,ACC_PIPE) + } +} +} // namespace cutlass::detail diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/sm103_blockscaled_layout.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/sm103_blockscaled_layout.hpp new file mode 100644 index 0000000000000000000000000000000000000000..300448d7cd0273ac4572b5484d5780d98576d4b5 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/detail/sm103_blockscaled_layout.hpp @@ -0,0 +1,107 @@ +/*************************************************************************************************** + * Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief Blocked Scale configs specific for SM103 BlockScaled MMA +*/ + +#pragma once + +#include "cutlass/layout/matrix.h" + +#include "cute/int_tuple.hpp" +#include "cute/atom/mma_traits_sm100.hpp" + +namespace cutlass::detail{ + +///////////////////////////////////////////////////////////////////////////////////////////////// +using namespace cute; + +template +struct Sm103BlockScaledBasicChunk { + + using Blk_MN = _128; + using Blk_SF = _4; + + using SfKMajorAtom = Layout< Shape< Shape< _8, _4, _4>, Shape, _4>>, + Stride, Stride< _0, _1>>>; + using SfMNMajorAtom = Layout< Shape< Shape, _4>, Shape<_8, _4, _4>>, + Stride, Stride<_16,_128, _4>>>; + using SfAtom = cute::conditional_t; +}; + +template +struct Sm103BlockScaledConfig { + // We are creating the SFA and SFB tensors' layouts in the collective since they always have the same layout. + // k-major order + static constexpr int SFVecSize = SFVecSize_; + using Sm103BlkScaledChunk = Sm103BlockScaledBasicChunk; + using Blk_MN = typename Sm103BlkScaledChunk::Blk_MN; + using Blk_SF = typename Sm103BlkScaledChunk::Blk_SF; + using SfAtom = typename Sm103BlkScaledChunk::SfAtom; + + using LayoutSF = decltype(tile_to_shape(SfAtom{}, make_shape(int(0),int(0),int(0)),Step<_2,_1,_3>{})); + + CUTE_HOST_DEVICE + static constexpr auto + deduce_layoutSFA() { + return LayoutSF{}; + } + + CUTE_HOST_DEVICE + static constexpr auto + deduce_layoutSFB() { + return LayoutSF{}; + } + + // The following function is provided for user fill dynamic problem size to the layout_SFA. 
+ template < class ProblemShape, class LayoutSFA = LayoutSF> + CUTE_HOST_DEVICE + static constexpr auto + tile_atom_to_shape_SFA(ProblemShape problem_shape, LayoutSFA layout_sfa = LayoutSFA{}) { + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_MNKL; + return tile_to_shape(SfAtom{}, make_shape(M,K,L), Step<_2,_1,_3>{}); + } + + // The following function is provided for user fill dynamic problem size to the layout_SFB. + template + CUTE_HOST_DEVICE + static constexpr auto + tile_atom_to_shape_SFB(ProblemShape problem_shape, LayoutSFB layout_sfb = LayoutSFB{}) { + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_MNKL; + return tile_to_shape(SfAtom{}, make_shape(N,K,L), Step<_2,_1,_3>{}); + } +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::detail diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/device_kernel.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/device_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..5b1d3e5b1feb5e38ec9a57e6ee784b3e0e9b5a27 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/device_kernel.h @@ -0,0 +1,129 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for generic CUTLASS kernel. +*/ + +#pragma once + +#include // CUTLASS_HOST_DEVICE +#include // cutlass::arch::synclog_* +#include // uint64_t + +// __grid_constant__ was introduced in CUDA 11.7. +#if ((__CUDACC_VER_MAJOR__ >= 12) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 7))) && !CUTLASS_CLANG_CUDA +# define CUTLASS_GRID_CONSTANT_SUPPORTED +#endif + +// __grid_constant__ can be enabled only on SM70+ +#if defined(CUTLASS_GRID_CONSTANT_SUPPORTED) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700) +# define CUTLASS_GRID_CONSTANT_ENABLED +#endif + +#if ! 
defined(CUTLASS_GRID_CONSTANT) +# if defined(CUTLASS_GRID_CONSTANT_ENABLED) +# define CUTLASS_GRID_CONSTANT __grid_constant__ +# else +# define CUTLASS_GRID_CONSTANT +# endif +#endif + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +template struct Type2Type { using type=T; }; +// using the simple type to replace the complex type to reduce this symbol size +template struct GetUnderlyingKernel : public Type2Type {}; +template class Wrapper > struct GetUnderlyingKernel> : public Wrapper {}; +template using GetUnderlyingKernel_t = typename GetUnderlyingKernel::type; + + +//////////////////////////////////////////////////////////////////////////////// + +/// Generic CUTLASS kernel template. +template +CUTLASS_GLOBAL +void Kernel(typename Operator::Params params) { + // Dynamic shared memory base pointer + extern __shared__ int SharedStorageBase[]; + // Declare pointer to dynamic shared memory. + typename Operator::SharedStorage *shared_storage = + reinterpret_cast(SharedStorageBase); + + Operator op; + + op(params, *shared_storage); + cutlass::arch::synclog_print(); +} + + +/// Generic CUTLASS kernel template. +template +CUTLASS_GLOBAL +void Kernel2(typename Operator::Params params) { + // Dynamic shared memory base pointer + extern __shared__ int SharedStorageBase[]; + // Declare pointer to dynamic shared memory. + typename Operator::SharedStorage *shared_storage = + reinterpret_cast(SharedStorageBase); + + Operator::invoke(params, *shared_storage); + cutlass::arch::synclog_print(); + +} + + +//////////////////////////////////////////////////////////////////////////////// +// +// 3.0 specific launch +// +//////////////////////////////////////////////////////////////////////////////// + +/// Generic CUTLASS kernel template. +template +CUTLASS_GLOBAL +#ifdef __CUDACC__ +// Enclosing this in __CUDACC__ suppresses MSVC warnings. 
+__launch_bounds__(Operator::MaxThreadsPerBlock, Operator::MinBlocksPerMultiprocessor) +#endif // __CUDACC__ +void device_kernel(CUTLASS_GRID_CONSTANT typename Operator::Params const params) +{ + // Dynamic shared memory base pointer + extern __shared__ char smem[]; + Operator op; + op(params, smem); + cutlass::arch::synclog_print(); + +} + +//////////////////////////////////////////////////////////////////////////////// +} /// namespace cutlass diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/collective_builder.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/collective_builder.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2bd817a5dd6e8cad0b4295ae1ff41d1f838eebf3 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/collective_builder.hpp @@ -0,0 +1,126 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include // cute::DefaultCopy +#include // cute::is_base_of_v + +#include "cutlass/detail/dependent_false.hpp" +#include "cutlass/epilogue/fusion/callbacks.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Used to specify epilogue subtile shape or dispatch to automatic computation of subtile shape +struct EpilogueTileAuto {}; + +// Used to let the builder pick the epilogue schedule automatically. 
+// Can be overridden with kernel schedule tags in cutlass/gemm/dispatch_policy.hpp +struct EpilogueScheduleAuto {}; + +template < + class ArchTag, + class OpClass, + class TileShape_MNK, + class ClusterShape_MNK, + class EpilogueTileType, + class ElementAccumulator, + class ElementCompute, + class ElementC, + class GmemLayoutTagC, + int AlignmentC, + class ElementD, + class GmemLayoutTagD, + int AlignmentD, + class EpilogueScheduleType, + class FusionOpOrCallbacks = cutlass::epilogue::fusion::LinearCombination, + class Enable = void +> +struct CollectiveBuilder { + static_assert(cutlass::detail::dependent_false, + "Could not build a collective epilogue for given parameters."); +}; + +// helper sub-builder for epilogue fusion callbacks (for internal use by CollectiveBuilder only) +namespace detail { + +// callbacks builder with operation tag +template< + class DispatchPolicy, + class FusionOp, + class TileShape_MNK, + class EpilogueTile_MN, + class ElementAccumulator, + class AccLoadOp = cute::DefaultCopy, + class = void +> +struct CallbacksBuilder { + using Callbacks = fusion::FusionCallbacks; +}; + +// callbacks builder with callbacks passthrough +template < + class DispatchPolicy, + class FusionCallbacks, + class TileShape_MNK, + class EpilogueTile_MN, + class AccLoadOp, + class ElementAccumulator +> +struct CallbacksBuilder< + DispatchPolicy, + FusionCallbacks, + TileShape_MNK, + EpilogueTile_MN, + ElementAccumulator, + AccLoadOp, + cute::enable_if_t> +> { + using Callbacks = FusionCallbacks; +}; + +} // namespace detail + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "builders/sm90_builder.inl" +#include "builders/sm100_builder.inl" +#include "builders/sm103_builder.inl" +#include "builders/sm120_builder.inl" + 
+///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/collective_epilogue.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/collective_epilogue.hpp new file mode 100644 index 0000000000000000000000000000000000000000..918017efa4c22da5ad673fbecb55d2c7cea4d68c --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/collective_epilogue.hpp @@ -0,0 +1,75 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class DispatchPolicy, + class... Args +> +class CollectiveEpilogue { + static_assert(cutlass::detail::dependent_false, "Could not find an epilogue specialization."); +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "detail.hpp" + +// +// Gemm +// +#include "default_epilogue.hpp" +#include "default_epilogue_array.hpp" +#include "epilogue_tensor_broadcast.hpp" +#include "sm70_epilogue_vectorized.hpp" +#include "sm70_epilogue_vectorized_array.hpp" +#include "sm90_epilogue_tma_warpspecialized.hpp" +#include "sm90_epilogue_tma_warpspecialized_bias_elementwise.hpp" +#include "sm90_epilogue_array_tma_warpspecialized.hpp" +#include "sm100_epilogue_nosmem.hpp" +#include "sm100_epilogue_array_nosmem.hpp" +#include "sm100_epilogue_tma_warpspecialized.hpp" +#include "sm100_epilogue_array_tma_warpspecialized.hpp" +// +// Conv +// 
+///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/default_epilogue.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/default_epilogue.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ed34bc10719d2ad45d22d890e3275ce8046c5385 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/default_epilogue.hpp @@ -0,0 +1,265 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing elementwise operations used by epilogues. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/arch/memory.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/detail.hpp" + +#include "cute/tensor.hpp" +#include "cute/numeric/numeric_types.hpp" +#include "cutlass/cuda_host_adapter.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Applies an element wise operation to all elements within the fragment +/// and writes them out to destination storage. 
+template < + class ElementC_, + class StrideC_, + class StrideD_, + class ThreadEpilogueOp_, + class EpilogueSchedule_ +> +class DefaultEpilogue { +public: + // + // Type Aliases + // + using EpilogueSchedule = EpilogueSchedule_; + using DispatchPolicy = EpilogueSchedule_; + + // derived types of output thread level operator + using ThreadEpilogueOp = ThreadEpilogueOp_; + using ElementOutput = typename ThreadEpilogueOp::ElementOutput; + using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator; + using ElementCompute = typename ThreadEpilogueOp::ElementCompute; + using ElementScalar = ElementCompute; + using ElementC = ElementC_; + using StrideC = StrideC_; + using ElementD = typename ThreadEpilogueOp::ElementD; + using StrideD = StrideD_; + + using GmemElementC = cute::conditional_t, ElementD, ElementC>; // prevents void ref breakages + + using GmemTiledCopyC = void; + using GmemTiledCopyD = void; + + static const int kOutputAlignment = ThreadEpilogueOp::kCount; + using AlignmentType = typename cute::uint_bit::value * kOutputAlignment>::type; + + static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + static_assert(cute::rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + + struct SharedStorage { }; + + using TensorStorage = SharedStorage; + + // Host side epilogue arguments + struct Arguments { + typename ThreadEpilogueOp::Params thread{}; + ElementC const* ptr_C = nullptr; + StrideC dC{}; + ElementD* ptr_D = nullptr; + StrideD dD{}; + }; + + // Device side epilogue params + using Params = Arguments; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments( + [[maybe_unused]] ProblemShape const& _, + Arguments const& args, + [[maybe_unused]] void* workspace) { + return args; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape 
const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + template + static bool + can_implement( + [[maybe_unused]] ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + return true; + } + + // Note: SharedStorage is unused for DefaultEpilogue + CUTLASS_HOST_DEVICE + DefaultEpilogue(Params const& params_, SharedStorage const& shared_storage = SharedStorage()) + : params(params_), epilogue_op(params_.thread) { } + + CUTLASS_DEVICE + bool + is_source_needed() { + return epilogue_op.is_source_needed(); + } + + template< + class ProblemShapeMNKL, + class BlockShapeMNK, + class BlockCoordMNKL, + class FrgEngine, class FrgLayout, + class TiledMma, + class ResidueMNK + > + CUTLASS_DEVICE void + operator()( + ProblemShapeMNKL problem_shape_mnkl, + BlockShapeMNK blk_shape_MNK, + BlockCoordMNKL blk_coord_mnkl, + cute::Tensor const& accumulators, + TiledMma tiled_mma, + [[maybe_unused]] ResidueMNK, + int thread_idx, + [[maybe_unused]] char*) + { + using namespace cute; + using X = Underscore; + + static_assert(cute::rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(is_static::value, "ThreadBlock tile shape must be static"); + static_assert(cute::rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3"); + static_assert(cute::rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 3"); + + // Separate out problem shape for convenience + auto M = get<0>(problem_shape_mnkl); + auto N = get<1>(problem_shape_mnkl); + auto L = get<3>(problem_shape_mnkl); + + auto stride_c = detail::get_epilogue_stride(params.dC); + auto stride_d = detail::get_epilogue_stride(params.dD); + + // Represent the full output tensor + Tensor mC_mnl = make_tensor(make_gmem_ptr(params.ptr_C), make_shape(M,N,L), stride_c); // (m,n,l) + Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D), make_shape(M,N,L), stride_d); // (m,n,l) 
+ Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + + // Slice to get the tile this CTA is responsible for + auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl; + Tensor gC = gC_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + Tensor gD = gD_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + + // Partition source and destination tiles to match the accumulator partitioning + auto thr_mma = tiled_mma.get_thread_slice(thread_idx); + Tensor tCgD = thr_mma.partition_C(gD); // (VEC,THR_M,THR_N) + Tensor tCgC = thr_mma.partition_C(gC); // (VEC,THR_M,THR_N) + + static_assert(is_static::value, "Accumulator layout must be static"); + CUTE_STATIC_ASSERT_V(size(tCgC) == size(tCgD), + "Source and destination must have the same number of elements."); + CUTE_STATIC_ASSERT_V(size(tCgD) == size(accumulators), + "Accumulator count must have the same destination element count."); + + // OOB predication for tile quantization "residue" + // Absolute coordinate tensors (dynamic) + auto shape_MN = make_shape(M,N); + Tensor mD_crd = make_identity_tensor(shape_MN); // (M,N) + Tensor cD_mn = local_tile(mD_crd, take<0,2>(blk_shape_MNK), make_coord(m_coord, n_coord)); // (BLK_M,BLK_N) + Tensor tCcD_mn = thr_mma.partition_C(cD_mn); // (VEC,THR_M,THR_N) + // Relative coordinate tensors (static) + Tensor cD = make_coord_tensor(cD_mn.layout()); // (BLK_M,BLK_N) + Tensor tCcD = make_coord_tensor(tCcD_mn.layout()); // (VEC,THR_M,THR_N) + // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate + auto residue_cD = shape_MN - cD_mn(_0{}); // (m,n) + auto residue_tCcD = shape_MN - tCcD_mn(_0{}); // (m,n) + + // Fully OOB tile + if (not elem_less(repeat_like(residue_cD, _0{}), residue_cD)) { + return; + } + + using FragCType = remove_cvref_t; + using FragDType 
= remove_cvref_t; + + // source is needed + if (epilogue_op.is_source_needed()) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accumulators); ++i) { + FragCType fragC; + bool pred = elem_less(tCcD(i), residue_tCcD); + arch::global_load(fragC, &tCgC(i), pred); + FragDType fragD = epilogue_op(accumulators(i), fragC); + arch::global_store(fragD, &tCgD(i), pred); + } + } + // source is not needed, avoid load + else { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accumulators); ++i) { + bool pred = elem_less(tCcD(i), residue_tCcD); + FragDType fragD = epilogue_op(accumulators(i)); + arch::global_store(fragD, &tCgD(i), pred); + } + } + } + +private: + Params params; + ThreadEpilogueOp epilogue_op; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/default_epilogue_array.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/default_epilogue_array.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3cab46ddcfd86ecbb2d3f1de43856f91e1002bfd --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/default_epilogue_array.hpp @@ -0,0 +1,287 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing elementwise operations used by epilogues. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/detail.hpp" + +#include "cute/tensor.hpp" +#include "cute/numeric/numeric_types.hpp" +#include "cutlass/trace.h" + +#include "cutlass/cuda_host_adapter.hpp" +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Applies an element wise operation to all elements within the fragment +// and writes them out to destination storage. +template < + class ElementC_, + class StrideC_, + class StrideD_, + class ThreadEpilogueOp_, + class EpilogueSchedule_ +> +class DefaultEpilogueArray { +public: + // + // Type Aliases + // + using EpilogueSchedule = EpilogueSchedule_; + using DispatchPolicy = EpilogueSchedule_; + + // derived types of output thread level operator + using ThreadEpilogueOp = ThreadEpilogueOp_; + using ElementOutput = typename ThreadEpilogueOp::ElementOutput; + using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator; + using ElementCompute = typename ThreadEpilogueOp::ElementCompute; + using ElementScalar = ElementCompute; + using ElementC = ElementC_; + using StrideC = StrideC_; + using InternalStrideC = cute::remove_pointer_t; + using ElementD = typename ThreadEpilogueOp::ElementD; + using StrideD = StrideD_; + using InternalStrideD = cute::remove_pointer_t; + + using GmemElementC = cute::conditional_t, ElementD, ElementC>; // prevents void ref breakages + + using GmemTiledCopyC = void; + using GmemTiledCopyD = void; + + static const int kOutputAlignment = ThreadEpilogueOp::kCount; + using AlignmentType = typename cute::uint_bit::value * kOutputAlignment>::type; + + static_assert(cute::is_same_v || cute::is_same_v || cute::is_same_v, "Incompatible epilogue schedule."); + 
static_assert(rank(InternalStrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + static_assert(rank(InternalStrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + + struct SharedStorage { }; + + using TensorMapStorage = SharedStorage; + + // Host side epilogue arguments + struct Arguments { + typename ThreadEpilogueOp::Params thread{}; + ElementC const** ptr_C = nullptr; + StrideC dC{}; + ElementD** ptr_D = nullptr; + StrideD dD{}; + }; + + // Device side epilogue params + using Params = Arguments; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments( + ProblemShape const&, + Arguments const& args, + [[maybe_unused]] void* workspace) { + return args; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args, int sm_count) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + template + static bool + can_implement( + [[maybe_unused]] ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + return true; + } + + CUTLASS_HOST_DEVICE + DefaultEpilogueArray(Params const& params_) + : params(params_) { } + + CUTLASS_DEVICE + bool + is_source_needed() { + // For Ptr-Array or Grouped Gemm we cannot determine if source is needed based on first beta. 
+ return true; + } + + template< + class ProblemShapeMNKL, + class BlockShapeMNK, + class BlockCoordMNKL, + class FrgEngine, class FrgLayout, + class TiledMma, + class ResidueMNK + > + CUTLASS_HOST_DEVICE void + operator()( + ProblemShapeMNKL problem_shape_mnkl, + BlockShapeMNK blk_shape_MNK, + BlockCoordMNKL blk_coord_mnkl, + cute::Tensor const& accumulators, + TiledMma tiled_mma, + [[maybe_unused]] ResidueMNK, + int thread_idx, + [[maybe_unused]] char*) + { + using namespace cute; + using X = Underscore; + + static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(is_static::value, "ThreadBlock tile shape must be static"); + static_assert(rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3"); + static_assert(rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 3"); + + // Separate out problem shape for convenience + auto M = get<0>(problem_shape_mnkl); + auto N = get<1>(problem_shape_mnkl); + auto L = get<3>(problem_shape_mnkl); + // Batches are managed by using appropriate pointers to C and D matrices + const int32_t mock_L = 1; + const int32_t mock_l_coord = 0; + // Slice to get the tile this CTA is responsible for + auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl; + + // If scalar alpha/beta are provided, i.e., same alpha/beta applies to all batches/groups. + // If pointers to alpha/beta are provided, i.e., alpha/beta can differ between batches/groups, + // we get the correct alpha/beta values for the current batch/group using group index. 
+ ThreadEpilogueOp epilogue_op = ThreadEpilogueOp(params.thread, l_coord); + + if (epilogue_op.is_source_needed() && params.dC == nullptr) { + // Beta value is non-zero while pointer to C is a nullptr + assert(0); + } + + auto [stride_c, stride_d] = [&, l = l_coord]() { + if constexpr (!cute::is_same_v) { + // If grouped gemm + if (epilogue_op.is_source_needed()) { + return make_tuple( + detail::get_epilogue_stride(params.dC[l]), + detail::get_epilogue_stride(params.dD[l]) + ); + } + else { + return make_tuple( + InternalStrideC{}, + detail::get_epilogue_stride(params.dD[l]) + ); + } + } + else { + return make_tuple( + detail::get_epilogue_stride(params.dC), + detail::get_epilogue_stride(params.dD) + ); + } + }(); + + // Represent the full output tensor + ElementC const* ptr_C_l = nullptr; + if (epilogue_op.is_source_needed()) { + ptr_C_l = params.ptr_C[l_coord]; + } + Tensor mC_mnl = make_tensor(make_gmem_ptr(ptr_C_l), make_shape(M,N,mock_L), stride_c); // (m,n,l) + Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D[l_coord]), make_shape(M,N,mock_L), stride_d); // (m,n,l) + Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + + Tensor gC = gC_mnl(_,_,m_coord,n_coord, mock_l_coord); // (BLK_M,BLK_N) + Tensor gD = gD_mnl(_,_,m_coord,n_coord, mock_l_coord); // (BLK_M,BLK_N) + + // Partition source and destination tiles to match the accumulator partitioning + auto thr_mma = tiled_mma.get_thread_slice(thread_idx); + Tensor tCgD = thr_mma.partition_C(gD); // (VEC,THR_M,THR_N) + Tensor tCgC = thr_mma.partition_C(gC); // (VEC,THR_M,THR_N) + + static_assert(is_static::value, "Accumulator layout must be static"); + CUTE_STATIC_ASSERT_V(size(tCgC) == size(tCgD), + "Source and destination must have the same number of elements."); + CUTE_STATIC_ASSERT_V(size(tCgD) == size(accumulators), + "Accumulator 
count must have the same destination element count."); + + // Absolute coordinate tensors (dynamic) + Tensor mD_crd = make_identity_tensor(make_shape(M,N)); // (M,N) + Tensor cD_mn = local_tile(mD_crd, take<0,2>(blk_shape_MNK), make_coord(m_coord, n_coord)); // (BLK_M,BLK_N) + Tensor tCcD = thr_mma.partition_C(cD_mn); // (VEC,THR_M,THR_N) + + // source is needed + if (epilogue_op.is_source_needed()) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accumulators); ++i) { + if (elem_less(tCcD(i), make_shape(M,N))) { + tCgD(i) = epilogue_op(accumulators(i), tCgC(i)); + } + } + } + // source is not needed, avoid load + else { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accumulators); ++i) { + if (elem_less(tCcD(i), make_shape(M,N))) { + tCgD(i) = epilogue_op(accumulators(i)); + } + } + } + } + +private: + Params params; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/detail.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/detail.hpp new file mode 100644 index 0000000000000000000000000000000000000000..fb09f8b19475fdeeca844b20a158933726d2a895 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/detail.hpp @@ -0,0 +1,887 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/pipeline/pipeline.hpp" +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/epilogue/dispatch_policy.hpp" + +#include "cute/tensor.hpp" +#include "cute/numeric/numeric_types.hpp" +#include "cute/util/type_traits.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +namespace detail { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +constexpr bool +is_m_major() { + return cutlass::gemm::detail::is_major<0,Stride>(); +} + +template +constexpr bool +is_n_major() { + return cutlass::gemm::detail::is_major<1,Stride>(); +} + +template +constexpr bool +is_im2col() { + return cute::is_same_v> + || cute::is_same_v> + || cute::is_same_v>; +} + +template +struct sm90_is_ptr_array_tma : cute::false_type {}; + +template<> +struct sm90_is_ptr_array_tma : cute::true_type {}; + +template<> +struct sm90_is_ptr_array_tma : cute::true_type {}; + +template<> +struct sm90_is_ptr_array_tma : cute::true_type {}; + +template +static constexpr bool sm90_is_ptr_array_tma_v = sm90_is_ptr_array_tma::value; + +template +struct sm90_is_ptr_array_tma_cooperative : cute::false_type {}; + +template<> +struct sm90_is_ptr_array_tma_cooperative : cute::true_type {}; + +template +static constexpr bool sm90_is_ptr_array_tma_cooperative_v = sm90_is_ptr_array_tma_cooperative::value; + +template +struct sm90_is_ptr_array_tma_pingpong : cute::false_type {}; + +template<> +struct sm90_is_ptr_array_tma_pingpong : cute::true_type {}; + +template +static constexpr bool sm90_is_ptr_array_tma_pingpong_v = sm90_is_ptr_array_tma_pingpong::value; + +template +struct sm90_is_ptr_array_tma_dispatch_policy : 
cute::false_type {}; + +template< + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + int NumEpilogueWarpGroups +> +struct sm90_is_ptr_array_tma_dispatch_policy< + Sm90PtrArrayTmaWarpSpecialized> + : cute::true_type {}; + +template< + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + int NumEpilogueWarpGroups +> +struct sm90_is_ptr_array_tma_dispatch_policy< + Sm120PtrArrayTmaWarpSpecialized> + : cute::true_type {}; + +template +static constexpr bool sm90_is_ptr_array_tma_dispatch_policy_v = sm90_is_ptr_array_tma_dispatch_policy::value; + +using cutlass::atomic_maximum; + +template +static constexpr int elements_per_access_v = cutlass::sizeof_bits::value / cutlass::sizeof_bits::value; + +template +static constexpr bool sm90_is_cooperative_v = + cute::is_base_of_v || + sm90_is_ptr_array_tma_cooperative_v; + +template +static constexpr bool sm90_is_warp_specialized_v = + (!sm90_is_ptr_array_tma_cooperative_v && sm90_is_ptr_array_tma_v) || + cute::is_base_of_v; + +template +static constexpr bool is_im2col_mode = + cute::is_same_v || + cute::is_same_v || + cute::is_same_v; + +template +struct EmptyStorage { + CUTLASS_HOST_DEVICE + T* data() { return nullptr; } +}; + +template +CUTLASS_HOST_DEVICE +auto get_epilogue_stride(Stride stride){ + if constexpr (cute::is_base_of_v|| + cute::is_base_of_v) { + return cute::make_stride(cute::get<1>(stride), cute::get<0>(stride), cute::get<2>(stride)); + } + else { + return stride; + } +} + +template +struct IsThreadEpilogueOpWithBias { + static constexpr bool value = false; + using type = typename ThreadEpilogueOp::ElementCompute; +}; + +template +struct IsThreadEpilogueOpWithBias > { + static constexpr bool value = true; + using type = typename ThreadEpilogueOp::ElementBias; +}; + +template +struct IsThreadEpilogueOpWithPerChannelScaling { + static constexpr bool value = false; +}; + +template +struct IsThreadEpilogueOpWithPerChannelScaling > { + 
static constexpr bool value = true; +}; + +template +struct IsThreadEpilogueOpWithResidualAdd { + static constexpr bool value = false; +}; + +template +struct IsThreadEpilogueOpWithResidualAdd > { + static constexpr bool value = ThreadEpilogueOp::IsResidualSupported; +}; + +template +struct IsThreadEpilogueOpWithActivation { + static constexpr bool value = false; + using type = void; +}; + +template +struct IsThreadEpilogueOpWithActivation > { + static constexpr bool value = true; + using type = typename ThreadEpilogueOp::ActivationFn; +}; + +template +struct IsThreadEpilogueOpWithPerChannelScaled { + static constexpr bool value = false; +}; + +template +struct IsThreadEpilogueOpWithPerChannelScaled > { + static constexpr bool value = ThreadEpilogueOp::IsPerRowScaleSupported || ThreadEpilogueOp::IsPerColScaleSupported; +}; + +template +struct IsThreadEpilogueOpWithElementwiseArguments : cute::false_type {}; + +template +struct IsThreadEpilogueOpWithElementwiseArguments< + ThreadEpilogueOp, + cute::void_t> : cute::true_type {}; + +// Check if ActivationFn has 'Arguments' type defined +template +struct sm100_act_has_arguments : cute::false_type {}; + +template +struct sm100_act_has_arguments > : cute::true_type {}; + +template +struct Sm100EpilogueOpNumAccumulatorMtxs { + static constexpr int value = 1; +}; + +template +struct Sm100EpilogueOpNumAccumulatorMtxs> { + static constexpr int value = EpilogueOp::NumAccumulatorMtxs; +}; + + +// Wrapper class to use operator-style epilogues in sm90 TMA warp-specialized kernels +template +class Sm90TmaWarpSpecializedAdapter : public EpilogueOp { +public: + using GmemTiledCopyC = void; + using GmemTiledCopyD = void; + + using LoadPipeline = cutlass::PipelineTransactionAsync<0>; + using LoadPipelineState = cutlass::PipelineState<0>; + constexpr static uint32_t TmaTransactionBytes = 0; + constexpr static bool RequiresTransactionBytes = false; + + using StorePipeline = cutlass::PipelineTmaStore<0>; + using StorePipelineState = 
cutlass::PipelineState<0>; + + using TensorStorage = typename EpilogueOp::SharedStorage; + using TensorMapStorage = typename EpilogueOp::SharedStorage; + using PipelineStorage = typename LoadPipeline::SharedStorage; + + template + CUTLASS_HOST_DEVICE + static constexpr int + get_load_pipe_increment(CtaTileMNK) { + return 1; + } + + template + CUTLASS_HOST_DEVICE + static constexpr int + get_store_pipe_increment(CtaTileMNK) { + return 1; + } + + CUTLASS_DEVICE + static void prefetch_tma_descriptors([[maybe_unused]] typename EpilogueOp::Params const&) { + } + + // ctor inheritance + using EpilogueOp::EpilogueOp; + + CUTLASS_HOST_DEVICE + Sm90TmaWarpSpecializedAdapter( + typename EpilogueOp::Params const& params, + [[maybe_unused]] TensorStorage& shared_tensors) + : EpilogueOp(params) { } + + CUTLASS_DEVICE + bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE auto + load_init( + [[maybe_unused]] typename EpilogueOp::Params const& params, + [[maybe_unused]] TensorMapStorage& shared_tensormaps, + [[maybe_unused]] int32_t sm_count, + [[maybe_unused]] int32_t sm_idx) { + return cute::make_tuple(nullptr); + } + + template< + class ProblemShapeMNKL, + class CtaTileMNK, + class CtaCoordMNKL, + class TiledMma + > + CUTLASS_DEVICE auto + load( + [[maybe_unused]] LoadPipeline load_pipeline, + LoadPipelineState load_pipe_producer_state, + [[maybe_unused]] ProblemShapeMNKL problem_shape_mnkl, + [[maybe_unused]] CtaTileMNK cta_tile_mnk, + [[maybe_unused]] CtaCoordMNKL cta_coord_mnkl, + [[maybe_unused]] TiledMma tiled_mma, + [[maybe_unused]] int thread_idx, + [[maybe_unused]] TensorStorage& shared_tensors, + [[maybe_unused]] int subtile_idx=-1) + { + return load_pipe_producer_state; + } + + template< + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL, + class TiledMma, + class TensorMapC + > + CUTLASS_DEVICE auto + load( + [[maybe_unused]] LoadPipeline load_pipeline, + LoadPipelineState load_pipe_producer_state, + [[maybe_unused]] 
ProblemShapeMNKL problem_shape_mnkl, + [[maybe_unused]] TileShapeMNK tile_shape_MNK, + [[maybe_unused]] TileCoordMNKL tile_coord_mnkl, + [[maybe_unused]] TiledMma tiled_mma, + [[maybe_unused]] int thread_idx, + [[maybe_unused]] TensorStorage& shared_tensors, + [[maybe_unused]] TensorMapC const& load_tensormap, + [[maybe_unused]] int subtile_idx=-1, + [[maybe_unused]] bool wait = false) + { + return load_pipe_producer_state; + } + + CUTLASS_DEVICE auto + load_tail( + [[maybe_unused]] LoadPipeline load_pipeline, + LoadPipelineState load_pipe_producer_state) + { + return load_pipe_producer_state; + } + + CUTLASS_DEVICE auto + store_init( + [[maybe_unused]] typename EpilogueOp::Params const& params, + [[maybe_unused]] TensorMapStorage& shared_tensormaps, + [[maybe_unused]] int32_t sm_count, + [[maybe_unused]] int32_t sm_idx, + [[maybe_unused]] int32_t warp_group_idx) { + return cute::make_tuple(nullptr); + } + + template< + class ProblemShapeMNKL, + class CtaTileMNK, + class CtaCoordMNKL, + class AccEngine, class AccLayout, + class TiledMma + > + CUTLASS_DEVICE auto + store( + [[maybe_unused]] LoadPipeline load_pipeline, + LoadPipelineState load_pipe_consumer_state, + [[maybe_unused]] StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state, + ProblemShapeMNKL problem_shape_mnkl, + CtaTileMNK cta_tile_mnk, + CtaCoordMNKL cta_coord_mnkl, + cute::Tensor accumulators, + TiledMma tiled_mma, + int thread_idx, + TensorStorage& shared_tensors, + int subtile_index = -1) + { + constexpr int BLK_M_RANK = cute::rank<0>(cta_tile_mnk); + auto m_max_coord = unwrap(cute::transform(make_seq{}, [&](auto i) { + return get<0,i>(problem_shape_mnkl) - get<0,i>(cta_tile_mnk) * get<0,i>(cta_coord_mnkl); + })); + + constexpr int BLK_N_RANK = cute::rank<1>(cta_tile_mnk); + auto n_max_coord = unwrap(cute::transform(make_seq{}, [&](auto i) { + return get<1,i>(problem_shape_mnkl) - get<1,i>(cta_tile_mnk) * get<1,i>(cta_coord_mnkl); + })); + + auto residue_mnk = 
make_tuple(m_max_coord, n_max_coord, Int<0>{}); + + (*this)( + problem_shape_mnkl, + cta_tile_mnk, + cta_coord_mnkl, + accumulators, + tiled_mma, + residue_mnk, + thread_idx, + reinterpret_cast(&shared_tensors)); + + return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state); + } + + template< + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL, + class AccEngine, class AccLayout, + class TiledMma, + class TensorMapD + > + CUTLASS_DEVICE auto + store( + [[maybe_unused]] LoadPipeline load_pipeline, + LoadPipelineState load_pipe_consumer_state, + [[maybe_unused]] StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state, + ProblemShapeMNKL problem_shape_mnkl, + TileShapeMNK tile_shape_MNK, + TileCoordMNKL tile_coord_mnkl, + cute::Tensor accumulators, + TiledMma tiled_mma, + int thread_idx, + TensorStorage& shared_tensors, + [[maybe_unused]] TensorMapD const& store_tensormap, + int subtile_index = -1) + { + constexpr int BLK_M_RANK = cute::rank<0>(tile_shape_MNK); + auto m_max_coord = unwrap(cute::transform(make_seq{}, [&](auto i) { + return get<0,i>(problem_shape_mnkl) - get<0,i>(tile_shape_MNK) * get<0,i>(tile_coord_mnkl); + })); + + constexpr int BLK_N_RANK = cute::rank<1>(tile_shape_MNK); + auto n_max_coord = unwrap(cute::transform(make_seq{}, [&](auto i) { + return get<1,i>(problem_shape_mnkl) - get<1,i>(tile_shape_MNK) * get<1,i>(tile_coord_mnkl); + })); + + auto residue_mnk = make_tuple(m_max_coord, n_max_coord, Int<0>{}); + + (*this)( + problem_shape_mnkl, + tile_shape_MNK, + tile_coord_mnkl, + accumulators, + tiled_mma, + residue_mnk, + thread_idx, + reinterpret_cast(&shared_tensors)); + + return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state); + } + + CUTLASS_DEVICE auto + store_tail( + [[maybe_unused]] LoadPipeline load_pipeline, + LoadPipelineState load_pipe_consumer_state, + [[maybe_unused]] StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state) { + return 
cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state); + } + + // Dummy methods to perform different parts of TMA/Tensormap modifications + + template + CUTLASS_DEVICE + void + tensormaps_perform_update( + [[maybe_unused]] TensorMapStorage& shared_tensormaps, + [[maybe_unused]] typename EpilogueOp::Params const& params, + [[maybe_unused]] cute::TmaDescriptor const* tensormap, + [[maybe_unused]] ProblemShapeMNKL problem_shape, + [[maybe_unused]] int32_t next_batch, + [[maybe_unused]] int32_t warp_group_idx) { } + + template + CUTLASS_DEVICE + void + tensormaps_cp_fence_release( + [[maybe_unused]] TensorMapStorage& shared_tensormaps, + [[maybe_unused]] cute::TmaDescriptor const* tensormap, + [[maybe_unused]] int32_t warp_group_idx) { } + + template + CUTLASS_DEVICE + void + tensormaps_fence_acquire([[maybe_unused]] cute::TmaDescriptor const* tensormap) { } +}; + + +// Wrapper class to use operator-style epilogues in sm100 TMA warp-specialized kernels +template +class Sm100TmaWarpSpecializedAdapter : public EpilogueOp { +public: + using LoadPipeline = cutlass::PipelineTransactionAsync<0>; // 0 stage to disable smem alloc + using LoadPipelineState = cutlass::PipelineState<0>; + + using StorePipeline = cutlass::PipelineTmaStore<1>; // tma store pipe has no smem alloc + using StorePipelineState = cutlass::PipelineState<1>; + + using TensorStorage = typename EpilogueOp::SharedStorage; + using TensorMapStorage = typename EpilogueOp::SharedStorage; + using PipelineStorage = typename LoadPipeline::SharedStorage; + + static constexpr int NumAccumulatorMtxs = Sm100EpilogueOpNumAccumulatorMtxs::value; + + template + CUTLASS_HOST_DEVICE + static constexpr int + get_load_pipe_increment(CtaTileMNK) { + return 1; + } + + template + CUTLASS_HOST_DEVICE + static constexpr int + get_store_pipe_increment(CtaTileMNK) { + return 1; + } + + CUTLASS_DEVICE + static void prefetch_tma_descriptors([[maybe_unused]] typename EpilogueOp::Params const&) { + } + + CUTLASS_DEVICE + 
bool + is_producer_load_needed() const { + return false; + } + + // ctor inheritance + using EpilogueOp::EpilogueOp; + + CUTLASS_DEVICE auto + load_init( + [[maybe_unused]] typename EpilogueOp::Params const& params, + [[maybe_unused]] TensorMapStorage& shared_tensormap, + [[maybe_unused]] int32_t const sm_count, + [[maybe_unused]] int32_t const sm_idx) const { + return cute::make_tuple(nullptr); + } + + template< + bool ReuseTmem = false, + class ProblemShapeMNKL, + class CtaTileMNK, + class CtaCoordMNKL, + class MmaTileMNK, + class TiledMma + > + CUTLASS_DEVICE auto + load( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_producer_state, + ProblemShapeMNKL problem_shape_mnkl, + CtaTileMNK cta_tile_mnk, + CtaCoordMNKL cta_coord_mnkl, + MmaTileMNK mma_tile_mnk, + TiledMma tiled_mma, + TensorStorage& shared_tensors, + bool reverse_epi_n = false) + { + // C load is performed in epilogue operator + return load_pipe_producer_state; + } + + // with Tensormap + template< + bool ReuseTmem = false, + class ProblemShapeMNKL, + class CtaTileShapeMNK, + class CtaTileCoordMNKL, + class MmaTileMNK, + class TiledMma, + class TensorMap + > + CUTLASS_DEVICE auto + load( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_producer_state, + ProblemShapeMNKL problem_shape_mnkl, + CtaTileShapeMNK tile_shape_mnk, + CtaTileCoordMNKL cta_coord_mnkl, + MmaTileMNK mma_tile_mnk, + TiledMma tiled_mma, + TensorStorage& shared_tensors, + [[maybe_unused]] cute::tuple const& load_tensormap_info, + bool reverse_epi_n = false) + { + // C load is performed in epilogue operator + return load_pipe_producer_state; + } + + CUTLASS_DEVICE void + load_tail( + [[maybe_unused]] LoadPipeline load_pipeline, + [[maybe_unused]] LoadPipelineState load_pipe_producer_state, + [[maybe_unused]] StorePipeline store_pipeline, + [[maybe_unused]] StorePipelineState store_pipe_producer_state) + { + } + + CUTLASS_DEVICE auto + store_init( + [[maybe_unused]] typename EpilogueOp::Params const& params, + 
[[maybe_unused]] TensorMapStorage& shared_tensormap, + [[maybe_unused]] int32_t const sm_count, + [[maybe_unused]] int32_t const sm_idx) const { + return cute::make_tuple(nullptr); + } + + template< + bool ReuseTmem = false, + class AccumulatorPipeline, + class AccumulatorPipelineState, + class ProblemShapeMNKL, + class CtaTileMNK, + class CtaCoordMNKL, + class MmaTileMNK, + class TiledMma, + class AccEngine, + class AccLayout + > + CUTLASS_DEVICE auto + store( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_consumer_state, + StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state, + AccumulatorPipeline acc_pipeline, + AccumulatorPipelineState acc_pipe_consumer_state, + ProblemShapeMNKL problem_shape_mnkl, + CtaTileMNK cta_tile_mnk, + CtaCoordMNKL cta_coord_mnkl, + MmaTileMNK mma_tile_mnk, + TiledMma tiled_mma, + cute::Tensor accumulators, + TensorStorage& shared_tensors + ) + { + // Wait for mma warp to fill tmem buffer with accumulator results + acc_pipeline.consumer_wait(acc_pipe_consumer_state); + + auto [acc_state_next] = (*this).template operator()( + acc_pipeline, + acc_pipe_consumer_state, + problem_shape_mnkl, + cta_tile_mnk, + cta_coord_mnkl, + accumulators, + shared_tensors); + + // Let mma warp know tmem buffer is consumed and empty + ++load_pipe_consumer_state; + ++store_pipe_producer_state; + + return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state, acc_state_next); + } + + // FastF32 API + template< + class ProblemShapeMNKL, + class CtaTileMNK, + class CtaCoordMNKL, + class MmaTileMNK, + class TiledMma, + class AccEngine, + class AccLayout, + class TiledCopyT2R + > + CUTLASS_DEVICE auto + store( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_consumer_state, + StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state, + ProblemShapeMNKL problem_shape_mnkl, + CtaTileMNK cta_tile_mnk, + CtaCoordMNKL cta_coord_mnkl, + MmaTileMNK mma_tile_mnk, + TiledMma tiled_mma, + 
cute::Tensor& tTR_rAcc, + TensorStorage& shared_tensors, + TiledCopyT2R tiled_t2r) + { + (*this)( + problem_shape_mnkl, + cta_tile_mnk, + cta_coord_mnkl, + tTR_rAcc, + shared_tensors, + tiled_t2r); + return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state); + } + + // FastF32 API with Tensor Map + template< + class ProblemShapeMNKL, + class CtaTileMNK, + class CtaCoordMNKL, + class MmaTileMNK, + class TiledMma, + class AccEngine, + class AccLayout, + class TiledCopyT2R, + class TensorMap + > + CUTLASS_DEVICE auto + store( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_consumer_state, + StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state, + ProblemShapeMNKL problem_shape_mnkl, + CtaTileMNK cta_tile_mnk, + CtaCoordMNKL cta_coord_mnkl, + MmaTileMNK mma_tile_mnk, + TiledMma tiled_mma, + cute::Tensor& tTR_rAcc, + TensorStorage& shared_tensors, + TensorMap tensormap, + TiledCopyT2R tiled_t2r) { + (*this)( + problem_shape_mnkl, + cta_tile_mnk, + cta_coord_mnkl, + tTR_rAcc, + shared_tensors, + tiled_t2r); + return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state); + } + + template< + bool ReuseTmem = false, + class AccumulatorPipeline, + class AccumulatorPipelineState, + class ProblemShapeMNKL, + class CtaTileMNK, + class TileCoordMNKL, + class MmaTileMNK, + class TiledMma, + class AccEngine, + class AccLayout, + class TensorMap + > + CUTLASS_DEVICE auto + store( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_consumer_state, + StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state, + AccumulatorPipeline acc_pipeline, + AccumulatorPipelineState acc_pipe_consumer_state, + ProblemShapeMNKL problem_shape_mnkl, + CtaTileMNK cta_tile_mnk, + TileCoordMNKL cta_coord_mnkl, + MmaTileMNK mma_tile_mnk, + TiledMma tiled_mma, + cute::Tensor accumulators, + TensorStorage& shared_tensors, + TensorMap tensormap + ) + { + // Wait for mma warp to fill tmem buffer with accumulator 
results + acc_pipeline.consumer_wait(acc_pipe_consumer_state); + + auto [acc_state_next] = (*this).template operator()( + acc_pipeline, + acc_pipe_consumer_state, + problem_shape_mnkl, + cta_tile_mnk, + cta_coord_mnkl, + accumulators, + shared_tensors); + + // Let mma warp know tmem buffer is consumed and empty + ++load_pipe_consumer_state; + ++store_pipe_producer_state; + + return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state, acc_state_next); + } + + template + CUTLASS_DEVICE void + store_tail( + [[maybe_unused]] LoadPipeline load_pipeline, + [[maybe_unused]] LoadPipelineState load_pipe_consumer_state, + [[maybe_unused]] StorePipeline store_pipeline, + [[maybe_unused]] StorePipelineState store_pipe_producer_state, + [[maybe_unused]] CtaTileMNK cta_tile_mnk) + { + } + + // Dummy methods to perform different parts of TMA/Tensormap modifications + + template + CUTLASS_DEVICE + void + tensormaps_perform_update( + [[maybe_unused]] TensorMapStorage& shared_tensormap, + [[maybe_unused]] typename EpilogueOp::Params const& params, + [[maybe_unused]] cute::TmaDescriptor const* tensormap, + [[maybe_unused]] ProblemShape problem_shape, + [[maybe_unused]] int32_t next_batch) { } + + template + CUTLASS_DEVICE + void + tensormaps_cp_fence_release( + [[maybe_unused]] TensorMapStorage& shared_tensormap, + [[maybe_unused]] cute::TmaDescriptor const* tensormap) { } + + template + CUTLASS_DEVICE + void + tensormaps_fence_acquire([[maybe_unused]] cute::TmaDescriptor const* tensormap) { } +}; + + +// SFINAE helpers for detecting beta/beta_ptr/beta_ptr_array in EVT arguments. 
+template +struct has_beta { + static constexpr bool value = false; +}; + +template +struct has_beta> { + static constexpr bool value = true; +}; + +template +struct has_beta_ptr { + static constexpr bool value = false; +}; + +template +struct has_beta_ptr> { + static constexpr bool value = true; +}; + +template +struct has_beta_ptr_array { + static constexpr bool value = false; +}; + +template +struct has_beta_ptr_array> { + static constexpr bool value = true; +}; + +} // namespace detail +} // namespace collective +} // namespace epilogue +} // namespace cutlass diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/epilogue_tensor_broadcast.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/epilogue_tensor_broadcast.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d32dd6aeefe91b2663a9c9adeee3848e16f6c08f --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/epilogue_tensor_broadcast.hpp @@ -0,0 +1,271 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Functor for performing tensor-tensor broadacasts atop existing epilogues. 
+ + Concretely, the opeartion performed is the following: + UnaryOp( + BinaryOp1( + BinaryOp0( + Activation((alpha * A @ B) + bias), + beta * C0 + ), + beta * C1 + ) + ) + + where: + - C0 and C1 have the same extents as the output + - BinaryOp0 and BinaryOp1 perform elementwise binary operations + - UnaryOp is an elementwise operation +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/collective/detail.hpp" + +#include "cute/tensor.hpp" +#include "cutlass/cuda_host_adapter.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Collective epilogue that applies elementwise tensor-tensor operations atop other epilogues +/// +template < + class StrideC_, + class StrideD_, + class ThreadEpilogueOp_, + class EpilogueSchedule_, + bool PerColumnBias_ = false +> +class EpilogueTensorBroadcast { +public: + // + // Type Aliases + // + using EpilogueSchedule = EpilogueSchedule_; + + // derived types of output thread level operator + using ThreadEpilogueOp = ThreadEpilogueOp_; + using ElementOutput = typename ThreadEpilogueOp::ElementOutput; + using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator; + using ElementCompute = typename ThreadEpilogueOp::ElementCompute; + using ElementScalar = ElementCompute; + using ElementBias = typename ThreadEpilogueOp::ElementBias; + using ElementC = typename ThreadEpilogueOp::ElementC; + using StrideC = StrideC_; + using ElementD = typename ThreadEpilogueOp::ElementD; + using StrideD = StrideD_; + using ActivationFunctor = typename ThreadEpilogueOp::ActivationFunctor; + + static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + static_assert(cute::rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + + static constexpr int kOutputAlignment = 
ThreadEpilogueOp::kCount; + using AlignmentType = typename cute::uint_bit::value * kOutputAlignment>::type; + + static constexpr bool IsBinaryOp0Enabled = ThreadEpilogueOp::IsBinaryOp0Enabled; + static constexpr bool IsBinaryOp1Enabled = ThreadEpilogueOp::IsBinaryOp1Enabled; + static constexpr bool IsUnaryOpEnabled = ThreadEpilogueOp::IsUnaryOpEnabled; + + static constexpr bool PerColumnBias = PerColumnBias_; + using BiasStride = typename cute::conditional_t, Stride<_1, _0, _0>>; + + struct SharedStorage { }; + + // Host side epilogue arguments + struct Arguments { + typename ThreadEpilogueOp::Params thread{}; + StrideC dC{}; + ElementD* ptr_D = nullptr; + StrideD dD{}; + ElementBias* ptr_Bias = nullptr; + ElementC* ptr_C0 = nullptr; + ElementC* ptr_C1 = nullptr; + }; + + // Device side epilogue params + using Params = Arguments; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments( + [[maybe_unused]] ProblemShape const& _, + Arguments const& args, + [[maybe_unused]] void* workspace) { + return args; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + template + static bool + can_implement( + [[maybe_unused]] ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + return true; + } + + CUTLASS_HOST_DEVICE + EpilogueTensorBroadcast(Params const& params_) + : params(params_), epilogue_op(params_.thread) { } + + CUTLASS_DEVICE + bool + is_source_needed() { + return epilogue_op.is_source0_needed() || epilogue_op.is_source1_needed(); + } + + template< + class ProblemShapeMNKL, + class BlockShapeMNK, + class BlockCoordMNKL, + class FrgEngine, class FrgLayout, + class TiledMma, + class 
ResidueMNK + > + CUTLASS_HOST_DEVICE void + operator()( + ProblemShapeMNKL problem_shape_mnkl, + BlockShapeMNK blk_shape_MNK, + BlockCoordMNKL blk_coord_mnkl, + cute::Tensor const& accumulators, + TiledMma tiled_mma, + ResidueMNK residue_mnk, + int thread_idx, + [[maybe_unused]] char* smem_buf) + { + using namespace cute; + using X = Underscore; + + static_assert(cute::rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(is_static::value, "ThreadBlock tile shape must be static"); + static_assert(cute::rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3"); + static_assert(cute::rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 4"); + + // Separate out problem shape for convenience + auto M = get<0>(problem_shape_mnkl); + auto N = get<1>(problem_shape_mnkl); + auto L = get<3>(problem_shape_mnkl); + + auto stride_c = detail::get_epilogue_stride(params.dC); + auto stride_d = detail::get_epilogue_stride(params.dD); + auto stride_bias = detail::get_epilogue_stride(BiasStride{}); + + // Represent the full output tensor + Tensor mC0_mnl = make_tensor(make_gmem_ptr(params.ptr_C0), make_shape(M,N,L), stride_c); // (m,n,l) + Tensor mC1_mnl = make_tensor(make_gmem_ptr(params.ptr_C1), make_shape(M,N,L), stride_c); // (m,n,l) + Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D), make_shape(M,N,L), stride_d); // (m,n,l) + Tensor mBias_mnl = make_tensor(make_gmem_ptr(params.ptr_Bias), make_shape(M,N,L), stride_bias); // (m,n,l) + + Tensor gC0_mnl = local_tile(mC0_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + Tensor gC1_mnl = local_tile(mC1_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + + Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + Tensor gBias_mnl = local_tile(mBias_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + + // Slice to get the tile this thread block 
is responsible for + auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl; + Tensor gC0 = gC0_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + Tensor gC1 = gC1_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + Tensor gD = gD_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + Tensor gBias = gBias_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + + // Partition source and destination tiles to match the accumulator partitioning + auto thr_mma = tiled_mma.get_thread_slice(thread_idx); + Tensor tCgD = thr_mma.partition_C(gD); // (VEC,THR_M,THR_N) + Tensor tCgC0 = thr_mma.partition_C(gC0); // (VEC,THR_M,THR_N) + Tensor tCgC1 = thr_mma.partition_C(gC1); // (VEC,THR_M,THR_N) + Tensor tCgBias = thr_mma.partition_C(gBias); // (VEC,THR_M,THR_N) + + static_assert(is_static::value, + "Accumulator layout must be static"); + CUTE_STATIC_ASSERT_V(size(tCgC0) == size(tCgD), + "Source and destination must have the same number of elements."); + CUTE_STATIC_ASSERT_V(size(tCgC1) == size(tCgD), + "Source and destination must have the same number of elements."); + CUTE_STATIC_ASSERT_V(size(tCgD) == size(accumulators), + "Accumulator count must have the same destination element count."); + CUTE_STATIC_ASSERT_V(size(tCgBias) == size(accumulators), + "Accumulator count must have the same destination element count."); + + auto cD = make_identity_tensor(make_shape(unwrap(shape<0>(gD)), unwrap(shape<1>(gD)))); + Tensor tCcD = thr_mma.partition_C(cD); + + bool bias_needed = params.ptr_Bias != nullptr; + bool c0_needed = (params.ptr_C0 != nullptr) && epilogue_op.is_source0_needed(); + bool c1_needed = (params.ptr_C1 != nullptr) && epilogue_op.is_source1_needed(); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(accumulators); ++i) { + if (elem_less(tCcD(i), make_coord(get<0>(residue_mnk), get<1>(residue_mnk)))) { + ElementBias bias = bias_needed ? tCgBias(i) : ElementBias(0); + ElementC c0 = c0_needed ? tCgC0(i) : ElementC(0); + ElementC c1 = c1_needed ? 
tCgC1(i) : ElementC(0); + + tCgD(i) = epilogue_op(accumulators(i), c0, c1, bias); + } + } + } + +private: + Params params; + ThreadEpilogueOp epilogue_op; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm100_epilogue_array_nosmem.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm100_epilogue_array_nosmem.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d3b2d0880e56fe65d7dda6efb982dee52f23b3e2 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm100_epilogue_array_nosmem.hpp @@ -0,0 +1,937 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing elementwise operations used by Ptr-Array and Grouped GEMM epilogues. +*/ + + + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/collective/detail.hpp" + +#include "cute/tensor.hpp" +#include "cute/numeric/numeric_types.hpp" +#include "cutlass/cuda_host_adapter.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Applies an element wise operation to all elements within the fragment +/// and writes it out to destination storage. 
+template < + class EpilogueTile_, // (EPI_TILE_M, EPI_TILE_N) + class ElementC_, + class StrideC_, + class ElementD_, + class StrideD_, + class ThreadEpilogueOp_, + class CopyOpT2R_, + class AlignmentC_, + class AlignmentD_ +> +class CollectiveEpilogue< + Sm100PtrArrayNoSmem, + EpilogueTile_, + ElementC_, + StrideC_, + ElementD_, + StrideD_, + ThreadEpilogueOp_, + CopyOpT2R_, + AlignmentC_, + AlignmentD_, + cute::enable_if_t::value> +> { +public: + // + // Type Aliases + // + using DispatchPolicy = Sm100PtrArrayNoSmem; + using EpilogueTile = EpilogueTile_; + // derived types of output thread level operator + using ThreadEpilogueOp = ThreadEpilogueOp_; + using ElementOutput = typename ThreadEpilogueOp::ElementOutput; + using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator; + using ElementCompute = typename ThreadEpilogueOp::ElementCompute; + using ElementScalar = ElementCompute; + using ElementBias = typename detail::IsThreadEpilogueOpWithBias::type; + using ElementC = typename ThreadEpilogueOp::ElementC; + using StrideC = StrideC_; + using InternalStrideC = cute::remove_pointer_t; + using ElementD = ElementD_; + using StrideD = StrideD_; + using InternalStrideD = cute::remove_pointer_t; + using CopyOpT2R = CopyOpT2R_; + using AlignmentC = AlignmentC_; + using AlignmentD = AlignmentD_; + + using GmemElementC = cute::conditional_t,ElementD,ElementC>; // prevents void ref breakages + + using GmemTiledCopyC = void; + using GmemTiledCopyD = void; + + constexpr static int ThreadCount = 128; + constexpr static int kOutputAlignment = ThreadEpilogueOp::kCount; + constexpr static bool isEpilogueBiasSupported = detail::IsThreadEpilogueOpWithBias::value; + using AlignmentType = typename cute::uint_bit::value * kOutputAlignment>::type; + constexpr static uint32_t TmaTransactionBytes = 0; + + struct SharedStorage { + struct TensorStorage { }; + struct TensorMapStorage { }; + }; + using TensorStorage = typename SharedStorage::TensorStorage; + using 
TensorMapStorage = typename SharedStorage::TensorMapStorage; + + // Host side epilogue arguments + struct Arguments { + typename ThreadEpilogueOp::Params thread{}; + ElementC const** ptr_C = nullptr; + StrideC dC{}; + ElementD** ptr_D = nullptr; + StrideD dD{}; + }; + + // Device side epilogue params + using Params = Arguments; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments( + [[maybe_unused]] ProblemShape const& problem_shape, + Arguments const& args, + [[maybe_unused]] void* workspace) { + return args; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args, int /*sm_count*/ = 0) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + template + static bool + can_implement( + [[maybe_unused]] ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + return true; + } + + CUTLASS_HOST_DEVICE + CollectiveEpilogue(Params const& params, SharedStorage&) : params(params) { }; + + template< + bool ReuseTmem = false, + class AccumulatorPipeline, + class AccumulatorPipelineState, + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL, + class AccEngine, class AccLayout + > + CUTLASS_DEVICE auto + operator()( + AccumulatorPipeline acc_pipeline, + AccumulatorPipelineState acc_pipe_consumer_state, + ProblemShapeMNKL problem_shape_mnkl, + TileShapeMNK cta_tile_shape_mnk, + TileCoordMNKL cta_coord_mnkl, + cute::Tensor const& accumulators, // (MMA,MMA_M,MMA_N) + [[maybe_unused]] SharedStorage&) { + + using namespace cute; + using X = Underscore; + + static_assert(is_tmem::value, "Accumulator must be TMEM resident."); + static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(rank(TileCoordMNKL{}) == 4, 
"TileCoordMNKL must be rank 4"); + + // Separate out problem shape for convenience + auto M = get<0>(problem_shape_mnkl); + auto N = get<1>(problem_shape_mnkl); + auto L = get<3>(problem_shape_mnkl); + // Slice to get the tile this CTA is responsible for + auto [m_coord, n_coord, k_coord, l_coord] = cta_coord_mnkl; + + // Batches are managed by using appropriate pointers to C and D matrices + auto problem_shape_mnl = append<3>(make_shape(M,N),Int<1>{}); + auto cta_coord_mnl = append<3>(make_shape(m_coord, n_coord),Int<0>{}); + auto cta_tiler = take<0,2>(cta_tile_shape_mnk); + + // If scalar alpha/beta are provided, i.e., same alpha/beta applies to all batches/groups. + // If pointers to alpha/beta are provided, i.e., alpha/beta can differ between batches/groups, + // we get the correct alpha/beta values for the current batch/group using group index. + ThreadEpilogueOp epilogue_op = ThreadEpilogueOp(params.thread, l_coord); + + auto [stride_c, stride_d] = [&, l = l_coord]() { + if constexpr (!cute::is_same_v) { + // If grouped gemm + if (epilogue_op.is_source_needed()) { + return make_tuple( + detail::get_epilogue_stride(params.dC[l]), + detail::get_epilogue_stride(params.dD[l]) + ); + } + else { + return make_tuple( + InternalStrideC{}, + detail::get_epilogue_stride(params.dD[l]) + ); + } + } + else { + return make_tuple( + detail::get_epilogue_stride(params.dC), + detail::get_epilogue_stride(params.dD) + ); + } + }(); + + // Get the residual tensor for the current batch + ElementC const* ptr_C_l = nullptr; + if (epilogue_op.is_source_needed()) { + ptr_C_l = params.ptr_C[l_coord]; + } + + // Represent the full output tensor, slice to get the tile this CTA is responsible for + Tensor mC = make_tensor(make_gmem_ptr(ptr_C_l), problem_shape_mnl, stride_c); // (M,N,L) + Tensor mD = make_tensor(make_gmem_ptr(params.ptr_D[l_coord]), problem_shape_mnl, stride_d); // (M,N,L) + Tensor gC = local_tile(mC, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) + Tensor gD = 
local_tile(mD, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) + + // Partition source and destination tiles according to tmem copy T2R partitioning (tTR_) + auto tiled_t2r = make_tmem_copy(CopyOpT2R{}, tensor<0>(accumulators)); + auto thread_idx = threadIdx.x % size(tiled_t2r); + + auto thread_t2r = tiled_t2r.get_slice(thread_idx); + Tensor tTR_gC = thread_t2r.partition_D(gC); // (T2R,T2R_M,T2R_N) + Tensor tTR_gD = thread_t2r.partition_D(gD); // (T2R,T2R_M,T2R_N) + Tensor tTR_rAcc = make_tensor(shape(tTR_gD)); // (T2R,T2R_M,T2R_N) + + Tensor tTR_rC = make_tensor(shape(tTR_gC)); // (T2R,T2R_M,T2R_N) + + Tensor coordCD = make_identity_tensor(problem_shape_mnl); // (M,N,L) -> (m,n,l) + Tensor cCD = local_tile(coordCD, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) -> (m,n,l) + Tensor tTR_cCD = thread_t2r.partition_D(cCD); // (T2R,T2R_M,T2R_N) -> (m,n,l) + + constexpr auto mclD = decltype(max_common_layout(tTR_rAcc.layout(), tTR_gD.layout())){}; + constexpr int VD = cute::min(AlignmentD{}, size(mclD)); + Tensor tTR_rD_frag = make_tensor(shape(tTR_rAcc)); + Tensor tTR_rD_src = recast>(coalesce(tTR_rD_frag)); + Tensor tR2G_rD_dst = recast>(coalesce(tTR_gD)); + + Tensor tTR_cD_mn_frg = tensor<1>(zipped_divide(coalesce(tTR_cCD), mclD.compose(Int{}))); + Tensor tDpD = make_tensor(shape(tR2G_rD_dst)); + + CUTLASS_PRAGMA_UNROLL + for (int t = 0; t < size(tDpD); t++) { + tDpD(t) = elem_less(tTR_cD_mn_frg(t), problem_shape_mnl); + } + + constexpr auto mclC = decltype(max_common_layout(tTR_rAcc.layout(), tTR_gC.layout())){}; + constexpr int VC = cute::min(AlignmentC{}, size(mclC)); + + Tensor tTR_cC_mn_frg = tensor<1>(zipped_divide(coalesce(tTR_cCD), mclC.compose(Int{}))); + Tensor tG2R_rC_dst = recast>(coalesce(tTR_gC)); + Tensor tCpC = make_tensor(shape(tG2R_rC_dst)); + + CUTLASS_PRAGMA_UNROLL + for (int t = 0; t < size(tCpC); t++) { + tCpC(t) = elem_less(tTR_cC_mn_frg(t), problem_shape_mnl); + } + Tensor tTR_rC_src = recast>(coalesce(tTR_gC)); + Tensor tTR_rC_dst = 
recast>(coalesce(tTR_rC)); + + // Detect interleaved complex fp32 kernels + [[maybe_unused]] Tensor accs = accumulators; + using ElementTmem = typename decltype(accs)::value_type; + constexpr bool is_interleaved_complex_f32 = is_complex::value && cute::is_same_v; + + // 1. Load accumulators into register from tmem + // Tmem -> rmem and transformation for interleaved complex kernels + if constexpr (is_interleaved_complex_f32) { + using ElementComputeAccumulator = float; + + Tensor tAccReal = accumulators(make_coord(_,_),_0{},_0{},_0{}); // (CTA_M,CTA_N) + Tensor tAccImag = accumulators(make_coord(_,_),_0{},_0{},_1{}); // (CTA_M,CTA_N) + Tensor tTR_tAccReal = thread_t2r.partition_S(tAccReal); // (T2R,T2R_M,T2R_N) + Tensor tTR_tAccImag = thread_t2r.partition_S(tAccImag); // (T2R,T2R_M,T2R_N) + Tensor tTR_rAccReal = make_tensor(shape(tTR_gD)); // (T2R,T2R_M,T2R_N) + Tensor tTR_rAccImag = make_tensor(shape(tTR_gD)); // (T2R,T2R_M,T2R_N) + + copy(tiled_t2r, tTR_tAccReal, tTR_rAccReal); + copy(tiled_t2r, tTR_tAccImag, tTR_rAccImag); + + // 1.1. Transform accumulators in registers + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tTR_rAccReal); i++) { + tTR_rAcc(i) = {tTR_rAccReal(i), tTR_rAccImag(i)}; + } + } + + // Standard tmem -> rmem epilogue + else { + Tensor tAcc = accumulators(make_coord(_,_),_0{},_0{}); // (CTA_M,CTA_N) + Tensor tTR_tAcc = thread_t2r.partition_S(tAcc); // (T2R,T2R_M,T2R_N) + + copy(tiled_t2r, tTR_tAcc, tTR_rAcc); + } + + cutlass::arch::fence_view_async_tmem_load(); + acc_pipeline.consumer_release(acc_pipe_consumer_state); + ++acc_pipe_consumer_state; + + // 2. 
Apply element-wise operation and store to gmem + // source is needed + if (epilogue_op.is_source_needed()) { + copy_if(tCpC, tTR_rC_src, tTR_rC_dst); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tTR_rAcc); i++) { + tTR_rD_frag(i) = epilogue_op(tTR_rAcc(i), tTR_rC(i)); + } + + copy_if(tDpD, tTR_rD_src, tR2G_rD_dst); + } + // source is not needed, avoid load + else { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tTR_rAcc); i++) { + tTR_rD_frag(i) = epilogue_op(tTR_rAcc(i)); + } + + copy_if(tDpD, tTR_rD_src, tR2G_rD_dst); + } + + return cute::make_tuple(acc_pipe_consumer_state); + } + + // API with Global Accumulator in registers for FastFP32 (emulated MMA) kernels. + // The accumulator in TMEM periodically loaded into the registers so that the MMA can clear out the TMEM accumulator + // values for better accuracy. This epilogue accepts the accumulator in registers and take TiledCopy for the + // TMEM->Reg as a parameter to be used in partitioning GMEM tensors C and D. + template< + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL, + class AccEngine, class AccLayout, + class TiledCopy + > + CUTLASS_DEVICE void + operator()( + ProblemShapeMNKL problem_shape_mnkl, + TileShapeMNK cta_tile_shape_mnk, + TileCoordMNKL cta_coord_mnkl, + cute::Tensor& tTR_rGlobAcc, // (MMA,MMA_M,MMA_N) + [[maybe_unused]] SharedStorage&, + TiledCopy tiled_t2r) { + + using namespace cute; + using X = Underscore; + + static_assert(is_rmem::value, "Accumulator must be Register resident."); + static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(rank(AccLayout{}) == 5, "Accumulators must be copy-partitioned: (T2R,T2R_M,T2R_N,EPI_M,EPI_N)"); + static_assert(rank(TileCoordMNKL{}) == 4, "TileCoordMNKL must be rank 4"); + + // Separate out problem shape for convenience + auto M = get<0>(problem_shape_mnkl); + auto N = get<1>(problem_shape_mnkl); + auto L = get<3>(problem_shape_mnkl); + // Slice to get the tile this CTA is 
responsible for + auto [m_coord, n_coord, k_coord, l_coord] = cta_coord_mnkl; + + // Batches are managed by using appropriate pointers to C and D matrices + auto problem_shape_mnl = append<3>(make_shape(M,N),Int<1>{}); + auto cta_coord_mnl = append<3>(make_shape(m_coord, n_coord),Int<0>{}); + auto cta_tiler = take<0,2>(cta_tile_shape_mnk); + + ThreadEpilogueOp epilogue_op{params.thread}; + // Get the residual tensor for the current batch + ElementC const* ptr_C_l = nullptr; + if (epilogue_op.is_source_needed()) { + ptr_C_l = params.ptr_C[l_coord]; + } + + // Represent the full output tensor, slice to get the tile this CTA is responsible for + Tensor mC = make_tensor(make_gmem_ptr(ptr_C_l), problem_shape_mnl, append<3>(params.dC,_0{})); // (M,N,L) + Tensor mD = make_tensor(make_gmem_ptr(params.ptr_D[l_coord]), problem_shape_mnl, append<3>(params.dD,_0{})); // (M,N,L) + Tensor gC = local_tile(mC, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) + Tensor gD = local_tile(mD, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) + + + // Partition source and destination tiles according to tmem copy T2R partitioning (tTR_) + auto thread_t2r = tiled_t2r.get_slice(threadIdx.x % size(tiled_t2r)); + Tensor tTR_gC = thread_t2r.partition_D(gC); // (T2R,T2R_M,T2R_N) + Tensor tTR_gD = thread_t2r.partition_D(gD); // (T2R,T2R_M,T2R_N) + + + Tensor coordD = make_identity_tensor(problem_shape_mnl); // (M,N,L) -> (m,n,l) + Tensor cD = local_tile(coordD, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) -> (m,n,l) + Tensor tTR_cD = thread_t2r.partition_D(cD); // (T2R,T2R_M,T2R_N) -> (m,n,l) + + // 2. 
Apply element-wise operation and store to gmem + // source is needed + if (epilogue_op.is_source_needed()) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tTR_rGlobAcc); ++i) { + if (elem_less(tTR_cD(i), problem_shape_mnl)) { + tTR_gD(i) = epilogue_op(tTR_rGlobAcc(i), tTR_gC(i)); + } + } + } + // source is not needed, avoid load + else { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tTR_rGlobAcc); ++i) { + if (elem_less(tTR_cD(i), problem_shape_mnl)) { + tTR_gD(i) = epilogue_op(tTR_rGlobAcc(i)); + } + } + } + } + +protected: + Params const& params; +}; + +template < + class EpilogueTile_, // (EPI_TILE_M, EPI_TILE_N) + class ElementC_, + class StrideC_, + class ElementD_, + class StrideD_, + class FusionCallbacks_, + class CopyOpT2R_, + class AlignmentC_, + class AlignmentD_ +> +class CollectiveEpilogue< + Sm100PtrArrayNoSmem, + EpilogueTile_, + ElementC_, + StrideC_, + ElementD_, + StrideD_, + FusionCallbacks_, + CopyOpT2R_, + AlignmentC_, + AlignmentD_, + cute::enable_if_t::value> +> { +public: + // + // Type Aliases + // + // Required by the gemm::kernel + using DispatchPolicy = Sm100PtrArrayNoSmem; + using ElementC = ElementC_; + using ElementD = ElementD_; + using GmemElementC = cute::conditional_t,ElementD,ElementC>; // prevents void ref breakages + using StrideC = StrideC_; + using StrideD = StrideD_; + using InternalStrideC = cute::remove_pointer_t; + using InternalStrideD = cute::remove_pointer_t; + using EpilogueTile = EpilogueTile_; + using CopyOpT2R = CopyOpT2R_; + using FusionCallbacks = FusionCallbacks_; + using ThreadEpilogueOp = typename epilogue::fusion::FusionCallbacksTraits::Operation; + + using GmemTiledCopyC = void; + using GmemTiledCopyD = void; + +private: + constexpr static bool IsReductionBufferNeeded = ThreadEpilogueOp::IsDePerRowBiasSupported + || is_same_v; // alloc reduction buffer for custom EVTs + constexpr static size_t ImplicitSharedStorageSize = IsReductionBufferNeeded ? 
size(EpilogueTile{}) : 0; + + // Not unroll epi subtile loop when the activation op is heavy to reduce instruction size and register pressure. + constexpr static bool UnrollEpiLoop = + not cutlass::epilogue::thread::kIsHeavy_member_or_false::value; + +public: + constexpr static int ThreadCount = 128; + constexpr static uint32_t TmaTransactionBytes = 0; + + struct SharedStorage { + using FusionStorage = typename FusionCallbacks::SharedStorage; + FusionStorage thread; + array_aligned buffer; + }; + + // Host side epilogue arguments + struct Arguments { + typename FusionCallbacks::Arguments thread{}; + ElementC const** ptr_C = nullptr; + StrideC dC = {}; + ElementD** ptr_D = nullptr; + StrideD dD = {}; + }; + + // Device side epilogue params + struct Params { + typename FusionCallbacks::Params thread{}; + ElementC const** ptr_C = nullptr; + StrideC dC = {}; + ElementD** ptr_D = nullptr; + StrideD dD = {}; + }; + + // + // Constructor and Data Members + // + CUTLASS_DEVICE + CollectiveEpilogue(Params const& params_, SharedStorage& shared_tensors) + : fusion_callbacks(params_.thread, shared_tensors.thread) + , smem_buffer_ptr(shared_tensors.buffer.data()) + , params(params_) {}; + +protected: + FusionCallbacks fusion_callbacks; + uint8_t* smem_buffer_ptr; + Params const& params; + +public: + + template + static constexpr Params + to_underlying_arguments( + [[maybe_unused]] ProblemShape const& problem_shape, + Arguments const& args, + [[maybe_unused]] void* workspace) { + return { + FusionCallbacks::to_underlying_arguments(problem_shape, args.thread, workspace), + args.ptr_C, + args.dC, + args.ptr_D, + args.dD + }; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args, int /*sm_count*/ = 0) { + return FusionCallbacks::get_workspace_size(problem_shape, args.thread); + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t 
stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return FusionCallbacks::initialize_workspace(problem_shape, args.thread, workspace, stream, cuda_adapter); + } + + template + static bool + can_implement( + [[maybe_unused]] ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + + bool fusion_implementable = FusionCallbacks::can_implement(problem_shape, args.thread); + if (!fusion_implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum requirements for FusionCallbacks.\n"); + } + return fusion_implementable; + } + + + template< + bool ReuseTmem = false, + class AccumulatorPipeline, + class AccumulatorPipelineState, + class ProblemShapeMNKL, + class CtaTileMNK, + class CtaCoordMNKL, + class AccEngine, class AccLayout + > + CUTLASS_DEVICE auto + operator()( + AccumulatorPipeline acc_pipeline, + AccumulatorPipelineState acc_pipe_consumer_state, + ProblemShapeMNKL problem_shape_mnkl, + CtaTileMNK cta_tile_mnk, + CtaCoordMNKL cta_coord_mnkl, + cute::Tensor accumulators, + [[maybe_unused]] SharedStorage& + ) { + using ElementAccumulator = typename AccEngine::value_type; + using ElementCompute_ = typename epilogue::fusion::FusionCallbacksTraits::ElementCompute; + using ElementCompute = cute::conditional_t,ElementAccumulator,ElementCompute_>; + + // Wait for mma warp to fill tmem buffer with accumulator results + static_assert(is_tmem::value, "Accumulator must be TMEM resident."); + static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(rank(CtaCoordMNKL{}) == 4, "TileCoordMNKL must be rank 4"); + static_assert(cute::sizeof_bits_v != 6, "Output element requires smem"); + + auto M = get<0>(problem_shape_mnkl); + auto N = get<1>(problem_shape_mnkl); + auto L = get<3>(problem_shape_mnkl); + + // Slice to get the tile this CTA is responsible for + auto [m_coord, n_coord, k_coord, l_coord] = cta_coord_mnkl; + + // Batches are managed by using appropriate pointers to C 
and D matrices + auto problem_shape_mnl = append<3>(make_shape(M,N),Int<1>{}); + auto cta_coord_mnl = append<3>(make_shape(m_coord, n_coord),Int<0>{}); + auto cta_tiler = take<0,2>(cta_tile_mnk); + auto cta_coord_mnk = cute::make_coord(m_coord, n_coord, k_coord, cute::Int<0>{}); + + bool is_C_load_needed = fusion_callbacks.is_C_load_needed(); + + auto [stride_c, stride_d] = [&, l = l_coord]() { + if constexpr (!cute::is_same_v) { + // If grouped gemm + if (is_C_load_needed) { + return make_tuple( + detail::get_epilogue_stride(params.dC[l]), + detail::get_epilogue_stride(params.dD[l]) + ); + } + else { + return make_tuple( + InternalStrideC{}, + detail::get_epilogue_stride(params.dD[l]) + ); + } + } + else { + return make_tuple( + detail::get_epilogue_stride(params.dC), + detail::get_epilogue_stride(params.dD) + ); + } + }(); + + // Get the residual tensor for the current batch + ElementC const* ptr_C_l = nullptr; + if (is_C_load_needed) { + ptr_C_l = params.ptr_C[l_coord]; + } + + int thread_idx = threadIdx.x % ThreadCount; + + Tensor tAcc = accumulators(make_coord(_,_),_0{},_0{}); // (CTA_M,CTA_N) + Tensor tAcc_epi = flat_divide(tAcc, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + TiledCopy tiled_t2r = make_tmem_copy(CopyOpT2R{}, tAcc_epi(_,_,_0{},_0{})); + ThrCopy thread_t2r = tiled_t2r.get_slice(thread_idx); + Tensor tTR_tAcc = thread_t2r.partition_S(tAcc_epi); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) + + constexpr int FragmentSize = size(EpilogueTile{}) / ThreadCount; + + Tensor coordD = make_identity_tensor(problem_shape_mnl); // (M,N,L) -> (m,n,l) + Tensor cD = local_tile(coordD, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) -> (m,n,l) + Tensor cD_epi = flat_divide(cD, EpilogueTile{}); + Tensor tTR_cD = thread_t2r.partition_D(cD_epi); // (T2R,T2R_M,T2R_N) -> (m,n,l) + + Tensor tTR_rAcc = make_tensor(shape(tTR_cD(_,_,_,_0{},_0{}))); + + // Construct the EVT consumer callbacks + auto residue_cD = make_coord(M,N) - cD(_0{}); + auto residue_tTR_cD = 
make_coord(M,N) - tTR_cD(_0{}); + Tensor cD_ = make_coord_tensor(cD.layout()); + Tensor tTR_cD_ = make_coord_tensor(tTR_cD.layout()); + constexpr bool RefSrc = false; + + Tensor mC = make_tensor(make_gmem_ptr(ptr_C_l), problem_shape_mnl, stride_c); + + Tensor tTR_gC = cutlass::epilogue::fusion::sm90_partition_for_epilogue( + mC, cta_tile_mnk, cta_coord_mnk, EpilogueTile{}, tiled_t2r, thread_idx); + + Tensor mD = make_tensor(make_gmem_ptr(recast_ptr(params.ptr_D[l_coord])), problem_shape_mnl, stride_d); + + Tensor tTR_gD = cutlass::epilogue::fusion::sm90_partition_for_epilogue( + mD, cta_tile_mnk, cta_coord_mnk, EpilogueTile{}, tiled_t2r, thread_idx); + + // Register Tensor + Tensor tTR_rD = make_tensor(take<0,3>(shape(tTR_gD))); + + Tensor coord_cCD = make_identity_tensor(problem_shape_mnl); + Tensor tTR_cCD = cutlass::epilogue::fusion::sm90_partition_for_epilogue( + coord_cCD, cta_tile_mnk, cta_coord_mnk, EpilogueTile{}, tiled_t2r, thread_idx); + constexpr auto mclD = decltype(max_common_layout(tTR_gD(_,_,_,_0{},_0{}), tTR_rD)){}; + constexpr int VD = cute::min(AlignmentD_{}, size(mclD)); + + auto tCrC = make_tensor(take<0,3>(shape(tTR_gC))); + constexpr auto mclC = decltype(max_common_layout(tTR_gC(_,_,_,_0{},_0{}), tCrC)){}; + constexpr int VC = cute::min(AlignmentC_{}, size(mclC)); + + Tensor tTR_rD_frg = recast>(coalesce(tTR_rD)); + + auto cst_args = cutlass::epilogue::fusion::detail::ConsumerStoreArgs{ + problem_shape_mnkl, + cta_tile_mnk, + cta_coord_mnkl, + int(0), + EpilogueTile{}, + tiled_t2r, + cD_, + residue_cD, + tTR_cD_, + residue_tTR_cD, + tCrC, + thread_idx + }; + + auto synchronize = [] () CUTLASS_LAMBDA_FUNC_INLINE { cutlass::arch::NamedBarrier::sync(ThreadCount, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; + + // The Epilogue Loop + auto epi_loop_fn = [&] (auto& cst_callbacks) CUTLASS_LAMBDA_FUNC_INLINE { + // Ensure there are no threads from the previous wave writing to shared memory being utilized for the current wave. 
+ synchronize(); + cst_callbacks.begin(); + if (cst_callbacks.begin_sync_needed()) { + synchronize(); + } + + // If tmem doesn't have enough capacity to support double buffering, a portion of tmem (a column of epilogue tiles) + // is overlapped between 2 pseudo-buffers. The shared tmem portion corresponds to the last epilogue tile column of + // tmem accumulator buffer 0, and the first epilogue tile column of tmem accumulator 1. + // Thus, whenever we are processing tmem accumulator buffer 0, we process the epilogue tiles with reversed column order. + // Once the last epilogue tile column is loaded from tmem, the acc_pipeline is released. + // Then, the next accumulation stage for buffer 1 can start. + [[maybe_unused]] bool reverse_epi_n = ReuseTmem && acc_pipe_consumer_state.phase() == 0; + static_assert(not (ReuseTmem && AccumulatorPipeline::Stages != 1), "Tmem reuse requires 1 accumulator stage"); + + // For each epilogue subtile within the CTA tile + constexpr int NumEpiSubtilesN = CUTE_STATIC_V(size<4>(tTR_tAcc)); + constexpr int NumEpiSubtilesM = CUTE_STATIC_V(size<3>(tTR_tAcc)); + + // Lambda to process a single epilogue tile + auto process_tile = [&](int epi_m, int epi_n, int iter_m, int iter_n) CUTLASS_LAMBDA_FUNC_INLINE { + bool is_last_iteration = iter_m == NumEpiSubtilesM-1 && iter_n == NumEpiSubtilesN-1; + bool do_acc_release = is_last_iteration; + + // Adjust release condition for tmem reuse + if constexpr (ReuseTmem) { + do_acc_release = iter_m == NumEpiSubtilesM-1 && iter_n == 0; // Release on first N iteration + } + + Tensor tTR_cCD_mn = tTR_cCD(_,_,_,epi_m,epi_n); + Tensor tTR_pCD_mn = cute::lazy::transform(tTR_cCD_mn, [&] (auto const& c) CUTLASS_LAMBDA_FUNC_INLINE { return elem_less(c, problem_shape_mnl); }); + cst_callbacks.begin_loop(epi_m, epi_n); + + if constexpr (not cute::is_void_v) { + if (is_C_load_needed) { + using CVecType = uint_bit_t>; + + if constexpr (!is_same_v) { + Tensor tTR_gC_frg = recast(coalesce(tTR_gC(_,_,_,epi_m,epi_n))); + 
Tensor tTR_rC_frg = recast(coalesce(tCrC)); + Tensor tTR_pC_frg = tensor<1>(zipped_divide(coalesce(tTR_pCD_mn), mclC.compose(Int{}))); + copy_if(tTR_pC_frg, tTR_gC_frg, tTR_rC_frg); + } + else { + auto tiled_g2r = make_tiled_copy_D(Copy_Atom{}, tiled_t2r); + auto thr_g2r = tiled_g2r.get_slice(threadIdx.x); + Tensor c_src = thr_g2r.retile_S(tTR_gC(_,_,_,epi_m,epi_n)); + Tensor c_dst = thr_g2r.retile_D(tCrC); + Tensor c_prd = thr_g2r.retile_D(tTR_pCD_mn); + copy_if(tiled_g2r, c_prd, c_src, c_dst); + } + } + } + + // Copy accumulator tile from tmem to register + // The current tile in tmem + Tensor tTR_tAcc_mn = tTR_tAcc(_,_,_,epi_m,epi_n); + + Tensor tTR_rAcc_frg = recast>(coalesce(tTR_rAcc)); + + copy(tiled_t2r, tTR_tAcc_mn, tTR_rAcc); + + // After the last tmem load, signal that tmem buffer is consumed and empty + if (do_acc_release) { + cutlass::arch::fence_view_async_tmem_load(); + acc_pipeline.consumer_release(acc_pipe_consumer_state); + ++acc_pipe_consumer_state; + } + + CUTLASS_PRAGMA_UNROLL + for (int epi_v = 0; epi_v < size(tTR_rAcc_frg); ++epi_v) { + tTR_rD_frg(epi_v) = cst_callbacks.visit(tTR_rAcc_frg(epi_v), epi_v, epi_m, epi_n); + } + + Tensor reduction_buffer = make_tensor( + raw_pointer_cast(make_smem_ptr(smem_buffer_ptr)), make_layout(Shape>{})); + + cst_callbacks.reduce(reduction_buffer, synchronize, epi_m, epi_n, is_last_iteration, tTR_rAcc /*not used*/); + + cst_callbacks.end_loop(epi_m, epi_n); + + using VecType = uint_bit_t>; + if constexpr (!is_same_v) { + Tensor tTR_gD_frg = recast(coalesce(tTR_gD(_,_,_,epi_m,epi_n))); + Tensor tTR_rD_frg = recast(coalesce(tTR_rD)); + Tensor tTR_pD_frg = tensor<1>(zipped_divide(coalesce(tTR_pCD_mn), mclD.compose(Int{}))); + copy_if(tTR_pD_frg, tTR_rD_frg, tTR_gD_frg); + } + else { + auto tiled_r2g = make_tiled_copy_D(Copy_Atom{}, tiled_t2r); + auto thr_r2g = tiled_r2g.get_slice(threadIdx.x); + Tensor src = thr_r2g.retile_S(tTR_rD); + Tensor dst = thr_r2g.retile_D(tTR_gD(_,_,_,epi_m,epi_n)); + Tensor prd = 
thr_r2g.retile_D(tTR_pCD_mn); + copy_if(tiled_r2g, prd, src, dst); + } + }; + + // Use static iteration with appropriate ordering + // When ReuseTmem is true and reverse_epi_n is true, we need reverse N iteration + auto n_seq = cute::make_int_sequence{}; + auto m_seq = cute::make_int_sequence{}; + + if constexpr (UnrollEpiLoop) { + // Fully unrolled static iteration + cute::for_each(n_seq, [&](auto I_N) CUTLASS_LAMBDA_FUNC_INLINE { + constexpr int iter_n = I_N; + int epi_n = iter_n; + if constexpr (ReuseTmem) { + if (reverse_epi_n) { + epi_n = NumEpiSubtilesN - 1 - iter_n; // Reverse N iteration + } + } + + cute::for_each(m_seq, [&](auto I_M) CUTLASS_LAMBDA_FUNC_INLINE { + constexpr int iter_m = I_M; + process_tile(iter_m, epi_n, iter_m, iter_n); + }); + }); + } else { + // Runtime loop with pragma unroll(1) + #pragma unroll 1 + for (int iter_n = 0; iter_n < NumEpiSubtilesN; ++iter_n) { + int epi_n = iter_n; + if constexpr (ReuseTmem) { + if (reverse_epi_n) { + epi_n = NumEpiSubtilesN - 1 - iter_n; // Reverse N iteration + } + } + + #pragma unroll 1 + for (int iter_m = 0; iter_m < NumEpiSubtilesM; ++iter_m) { + process_tile(iter_m, epi_n, iter_m, iter_n); + } + } + } + + cst_callbacks.end(); + }; + + // + // BEGIN EPILOGUE + // + auto cst_callbacks = fusion_callbacks.template get_consumer_store_callbacks(cst_args); + epi_loop_fn(cst_callbacks); + return cute::make_tuple(acc_pipe_consumer_state); + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// For sm100 kernels requiring warp specialized epilogues +template < + class EpilogueTile_, // (EPI_TILE_M, EPI_TILE_N) + class ElementC_, + class StrideC_, + class ElementD_, + class StrideD_, + class ThreadEpilogueOp_, + class CopyOpT2R_, + class AlignmentC, + class AlignmentD +> +class CollectiveEpilogue< + Sm100PtrArrayNoSmemWarpSpecialized, + EpilogueTile_, + ElementC_, + StrideC_, + ElementD_, + StrideD_, + ThreadEpilogueOp_, + CopyOpT2R_, + AlignmentC, + 
AlignmentD +> : public detail::Sm100TmaWarpSpecializedAdapter> +{ +public: + // ctor inheritance + using detail::Sm100TmaWarpSpecializedAdapter>::Sm100TmaWarpSpecializedAdapter; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm100_epilogue_array_tma_warpspecialized.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm100_epilogue_array_tma_warpspecialized.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1f0a915d7d61411de8f1fd6158365904258bf9fd --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm100_epilogue_array_tma_warpspecialized.hpp @@ -0,0 +1,1526 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Functor performing elementwise operations used by Ptr-Array and Grouped Gemm epilogue. 
+*/ + + + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/arch/barrier.h" +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/detail.hpp" +#include "cutlass/epilogue/thread/scale_type.h" +#include "cutlass/epilogue/fusion/callbacks.hpp" +#include "cutlass/epilogue/fusion/sm100_callbacks_tma_warpspecialized.hpp" +#include "cutlass/detail/layout.hpp" +#include "cutlass/trace.h" + +#include "cute/tensor.hpp" +#include "cutlass/cuda_host_adapter.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + int StagesC_, + int StagesD_, + int FragmentSize_, + bool ReuseSmemC_, + bool DelayTmaStore_, + class CtaTileShape_, // (CTA_M,CTA_N,CTA_K) + class EpilogueTile_, // (EPI_TILE_M, EPI_TILE_N) + class ElementC_, + class StrideC_, + class ElementD_, + class StrideD_, + class FusionCallbacks_, + class CopyOpT2R_, + class CopyOpG2S_, + class SmemLayoutAtomC_, + class CopyOpS2R_, + class CopyOpS2G_, + class SmemLayoutAtomD_, + class CopyOpR2S_, + class CopyOpR2R_ +> +class CollectiveEpilogue< + Sm100PtrArrayTmaWarpSpecialized, + CtaTileShape_, + EpilogueTile_, + ElementC_, + StrideC_, + ElementD_, + StrideD_, + FusionCallbacks_, + CopyOpT2R_, + CopyOpG2S_, + SmemLayoutAtomC_, + CopyOpS2R_, + CopyOpS2G_, + SmemLayoutAtomD_, + CopyOpR2S_, + CopyOpR2R_ +> { +public: + // + // Type Aliases + // + using DispatchPolicy = Sm100PtrArrayTmaWarpSpecialized; + using CtaTileShape = CtaTileShape_; + using EpilogueTile = EpilogueTile_; + using FusionCallbacks = FusionCallbacks_; + using ElementC = ElementC_; + using StrideC = StrideC_; + using InternalStrideC = cute::remove_pointer_t; + using ElementD = ElementD_; + using StrideD = StrideD_; + using InternalStrideD = cute::remove_pointer_t; + using CopyOpT2R = CopyOpT2R_; + using 
CopyOpG2S = CopyOpG2S_; + using SmemLayoutAtomC = SmemLayoutAtomC_; + using CopyOpS2R = CopyOpS2R_; + using CopyOpS2G = CopyOpS2G_; + using SmemLayoutAtomD = SmemLayoutAtomD_; + using CopyOpR2S = CopyOpR2S_; + using CopyOpR2R = CopyOpR2R_; + + using ThreadEpilogueOp = typename epilogue::fusion::FusionCallbacksTraits::Operation; + using GmemTiledCopyC = CopyOpG2S; + using GmemTiledCopyD = CopyOpS2G; + + constexpr static int ThreadCount = 128; + + static_assert(!is_layout::value && is_tuple::value, "EpilogueTile must be a cute::Tile or cute::Shape"); + static_assert(rank(EpilogueTile{}) == 2, "EpilogueTile must be rank-2: [EPI_TILE_M, EPI_TILE_N]"); + +private: + + constexpr static bool is_source_supported = not cute::is_void_v; + constexpr static bool is_destination_supported = not cute::is_void_v; + using GmemElementD = cute::conditional_t>; + using GmemElementC = cute::conditional_t; // prevents void ref breakages + static_assert(not cute::is_void_v, "GmemElementD is void"); + + using SmemElementD = typename cutlass::detail::get_unpacked_element_type::type; + using SmemElementC = typename cutlass::detail::get_unpacked_element_type::type; + constexpr static int StagesC = StagesC_; + constexpr static int StagesD = StagesD_; + static_assert(StagesC >= 1, "StagesC must be >= 1"); + static_assert(StagesD >= 1, "StagesD must be >= 1"); + + constexpr static bool ReuseSmemC = ReuseSmemC_ && is_destination_supported; + + constexpr static bool is_m_major_C = detail::is_m_major(); + constexpr static bool is_m_major_D = detail::is_m_major(); + + using SmemLayoutStageC = decltype(tile_to_shape(SmemLayoutAtomC{}, product_each(shape(EpilogueTile{})), + cute::conditional_t, Step<_1,_2>>{} )); + using SmemLayoutStageD = decltype(tile_to_shape(SmemLayoutAtomD{}, product_each(shape(EpilogueTile{})), + cute::conditional_t, Step<_1,_2>>{} )); + + constexpr static int StageCBits = cosize_v * sizeof_bits_v; + constexpr static int StageDBits = cosize_v * sizeof_bits_v; + constexpr static 
int MaxStageBits = cute::max(StageCBits, StageDBits); + constexpr static int StrideStageC = (ReuseSmemC ? MaxStageBits : StageCBits) / sizeof_bits_v; + constexpr static int StrideStageD = (ReuseSmemC ? MaxStageBits : StageDBits) / sizeof_bits_v; + + using SmemLayoutC = decltype(cute::append<3>(SmemLayoutStageC{}, Layout, Int>{})); + using SmemLayoutD = decltype(cute::append<3>(SmemLayoutStageD{}, Layout, Int>{})); + + constexpr static bool support_smem_reuse = is_source_supported && is_destination_supported && StagesD <= StagesC + && MaxStageBits % sizeof_bits_v == 0 + && MaxStageBits % sizeof_bits_v == 0; + static_assert(not (ReuseSmemC && not support_smem_reuse), "Smem reuse requirements not met"); + + constexpr static size_t SmemAlignmentC = cutlass::detail::alignment_for_swizzle(SmemLayoutC{}); + constexpr static size_t SmemAlignmentD = cutlass::detail::alignment_for_swizzle(SmemLayoutD{}); + constexpr static size_t MaxSmemAlignment = cute::max(SmemAlignmentC, SmemAlignmentD); + + // Not unroll epi subtile loop when the activation op is heavy to reduce instruction size and register pressure. 
+ constexpr static bool UnrollEpiLoop = + not cutlass::epilogue::thread::kIsHeavy_member_or_false::value; + // TMA store delay only benefits with loop unrolling + constexpr static bool DelayTmaStore = DelayTmaStore_ and UnrollEpiLoop; + + struct CollectiveStorageWithC { + alignas(SmemAlignmentC) ArrayEngine> smem_C; + alignas(SmemAlignmentD) ArrayEngine> smem_D; + }; + + union CollectiveStorageWithoutC { + cute::array smem_C; + alignas(SmemAlignmentD) ArrayEngine> smem_D; + }; + + union CollectiveStorageReuseC { + alignas(MaxSmemAlignment) ArrayEngine> smem_C; + alignas(MaxSmemAlignment) ArrayEngine> smem_D; + }; + +public: + // TMA pipeline for loading C + using LoadPipeline = cutlass::PipelineTransactionAsync; + using LoadPipelineState = cutlass::PipelineState; + constexpr static uint32_t TmaTransactionBytes = StageCBits / 8; + constexpr static uint32_t MinTensorMapWorkspaceAlignment = 64; + + // TMA pipeline for storing D + using StorePipeline = cute::conditional_t, + cutlass::PipelineTmaStore>; + using StorePipelineState = cutlass::PipelineState; + + struct SharedStorage { + struct TensorStorage { + using CollectiveStorage = cute::conditional_t>; + CollectiveStorage collective; + + using FusionStorage = typename FusionCallbacks::SharedStorage; + FusionStorage thread; + } tensors; + + struct TensorMapStorage : cute::aligned_struct<128, _0> { + cute::TmaDescriptor smem_tensormap_C; + cute::TmaDescriptor smem_tensormap_D; + } tensormaps; + + using PipelineStorage = typename LoadPipeline::SharedStorage; + PipelineStorage pipeline; + }; + using TensorStorage = typename SharedStorage::TensorStorage; + using TensorMapStorage = typename SharedStorage::TensorMapStorage; + using PipelineStorage = typename SharedStorage::PipelineStorage; + + // Planar complex kernels have two accumulator copies for the real and imaginary tensors. 
+ constexpr static int NumAccumulatorMtxs = 1; + + static constexpr bool IsGroupedGemmKernel = !cute::is_same_v; + + // Host side epilogue arguments + struct Arguments { + typename FusionCallbacks::Arguments thread{}; + ElementC const** ptr_C = nullptr; + StrideC dC{}; + ElementD** ptr_D = nullptr; + StrideD dD{}; + }; + + // Device side epilogue params + struct Params { + using TensorShapeC = decltype(repeat_like(append<3>(StrideC{}, _1{}), int32_t(0))); + using TensorShapeD = decltype(repeat_like(append<3>(StrideD{}, _1{}), int32_t(0))); + using TMA_C = decltype(make_tma_copy( + CopyOpG2S{}, + make_tensor( + make_gmem_ptr(static_cast(nullptr)), + TensorShapeC{}, + append<3>(InternalStrideC{}, _0{})), + SmemLayoutStageC{}, + EpilogueTile{}, + _1{})); + using TMA_D = decltype(make_tma_copy( + CopyOpS2G{}, + make_tensor( + make_gmem_ptr(static_cast(nullptr)), + TensorShapeD{}, + append<3>(InternalStrideD{}, _0{})), + SmemLayoutStageD{}, + EpilogueTile{}, + _1{})); + + typename FusionCallbacks::Params thread{}; + TMA_C tma_load_c; + TMA_D tma_store_d; + cute::TmaDescriptor* tensormaps; + ElementC const** ptr_C; + StrideC dC; + ElementD** ptr_D; + StrideD dD; + }; + + // + // Gemm Host Functions + // + + template + static constexpr Params + to_underlying_arguments( + ProblemShape const& problem_shape, + Arguments const& args, + void* workspace) { + // These tensor shapes (only applicable for grouped gemm) and pointers are only used to create tensormap/tma desc. + // These will be replaced with correct values before the initial tma load. + auto init_M = int32_t(size<0>(CtaTileShape{})); + auto init_N = int32_t(size<1>(CtaTileShape{})); + auto init_L = 1; + + InternalStrideC stride_c; + InternalStrideD stride_d; + if constexpr (IsGroupedGemmKernel) { + // Strides for Grouped Gemm will be replaced prior to the first access regardless. 
+ stride_c = InternalStrideC{}; + stride_d = InternalStrideD{}; + } + else { + // Tensor shapes for Ptr-Array are initialized correctly only here. + auto problem_shape_MNKL = append<4>(problem_shape.get_host_problem_shape(0), 1); + init_M = get<0>(problem_shape_MNKL); + init_N = get<1>(problem_shape_MNKL); + + stride_c = args.dC; + stride_d = args.dD; + } + + typename Params::TMA_C tma_load_c{}; + if constexpr (is_source_supported) { + // Tensor pointers will be fixed before the first access + ElementC const* ptr_C_first_batch = nullptr; + Tensor tensor_c = make_tensor(ptr_C_first_batch, make_layout(make_shape(init_M,init_N,init_L), append<3>(stride_c, _0{}))); + tma_load_c = make_tma_copy(CopyOpG2S{}, tensor_c, SmemLayoutStageC{}, EpilogueTile{}, _1{}); + } + + typename Params::TMA_D tma_store_d{}; + if constexpr (is_destination_supported) { + // Tensor pointers will be fixed before the first access + ElementD* ptr_D_first_batch = nullptr; + Tensor tensor_d = make_tensor(ptr_D_first_batch, make_layout(make_shape(init_M,init_N,init_L), append<3>(stride_d, _0{}))); + tma_store_d = make_tma_copy(CopyOpS2G{}, tensor_d, SmemLayoutStageD{}, EpilogueTile{}, _1{}); + } + + auto fusion_workspace = static_cast(workspace); + auto fusion_workspace_size = round_nearest(FusionCallbacks::get_workspace_size(problem_shape, args.thread), MinTensorMapWorkspaceAlignment); + auto tma_descriptor_workspace = reinterpret_cast( + static_cast(workspace) + fusion_workspace_size); + + return { + FusionCallbacks::to_underlying_arguments(problem_shape, args.thread, fusion_workspace), + tma_load_c, + tma_store_d, + tma_descriptor_workspace, + args.ptr_C, + args.dC, + args.ptr_D, + args.dD + }; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args, int sm_count) { + constexpr uint32_t NumInputTensors = cute::is_void_v ? 
1 : 2; + constexpr size_t SizeOfCuTensorMap = sizeof(cute::TmaDescriptor); + // Allocate gmem space for input tensormaps per each SM, A tensormap copies followed by B tensormap copies + return (NumInputTensors * SizeOfCuTensorMap * sm_count) + (round_nearest(FusionCallbacks::get_workspace_size(problem_shape, args.thread), MinTensorMapWorkspaceAlignment)); + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return FusionCallbacks::initialize_workspace(problem_shape, args.thread, workspace, stream, cuda_adapter); + } + + template + static bool + can_implement( + ProblemShape problem_shape, + [[maybe_unused]] Arguments const& args) { + bool implementable = true; + bool fusion_implementable = true; + + if (problem_shape.is_host_problem_shape_available()) { + for (int i = 0; i < problem_shape.groups(); ++i) { + auto problem_shape_MNKL = append<4>(problem_shape.get_host_problem_shape(i), 1); + auto [M,N,K,L] = problem_shape_MNKL; + + if constexpr (is_destination_supported) { + constexpr int tma_alignment_bits_D = cutlass::detail::get_output_alignment_bits(); + constexpr int min_tma_aligned_elements_D = tma_alignment_bits_D / cutlass::sizeof_bits::value; + implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(M,N,L), InternalStrideD{}); + } + + if constexpr (is_source_supported) { + constexpr int tma_alignment_bits_C = cutlass::detail::get_input_alignment_bits(); + constexpr int min_tma_aligned_elements_C = tma_alignment_bits_C / cutlass::sizeof_bits::value; + implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(M,N,L), InternalStrideC{}); + } + + fusion_implementable = fusion_implementable && FusionCallbacks::can_implement(problem_shape_MNKL, args.thread); + } + } + else { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Ignoring check to can implement because host 
problem shape is not available.\n"); + } + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); + } + + if (!fusion_implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum requirements for FusionCallbacks.\n"); + } + + bool beta_implementable = true; + + if (cute::is_void_v || args.ptr_C == nullptr) { + if constexpr (detail::has_beta::value) { + beta_implementable = args.thread.beta == 0.0; + } + if constexpr (detail::has_beta_ptr::value) { + beta_implementable = beta_implementable && args.thread.beta_ptr == nullptr; + } + if constexpr (detail::has_beta_ptr_array::value) { + beta_implementable = beta_implementable && args.thread.beta_ptr_array == nullptr; + } + } + + if (!beta_implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Beta/beta pointer was set, but epilogue is sourceless (void-C).\n"); + } + + return implementable && fusion_implementable && beta_implementable; + } + + // + // Static Device Functions + // + + template + CUTLASS_DEVICE + static constexpr int + get_load_pipe_increment(CtaTileMNK const& cta_tile_mnk) { + // Compute number of epilogue subtiles + return size<1>(zipped_divide(make_layout(take<0,2>(cta_tile_mnk)), EpilogueTile{})); + } + + template + CUTLASS_DEVICE + static constexpr int + get_store_pipe_increment(CtaTileMNK const& cta_tile_mnk) { + return get_load_pipe_increment(cta_tile_mnk); + } + + // + // Constructor and Data Members + // + CUTLASS_DEVICE + CollectiveEpilogue(Params const& params_, TensorStorage& shared_tensors) + : params(params_), fusion_callbacks(params_.thread, shared_tensors.thread) {} + +private: + Params const& params; + FusionCallbacks fusion_callbacks; + + // + // Non-static Device Functions + // +public: + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return fusion_callbacks.is_producer_load_needed(); + } + + CUTLASS_DEVICE auto + load_init( + Params const& params, + 
TensorMapStorage& shared_tensormap, + int32_t const sm_count, + int32_t const sm_idx) const { + // Fetch a copy of tensormaps for the CTA from Params + constexpr bool IsEpiLoad = true; + auto load_tensormap = tensormaps_init(params, shared_tensormap, sm_count, sm_idx); + return cute::make_tuple(load_tensormap); + } + + template< + bool ReuseTmem = false, + class ProblemShapeMNKL, + class CtaTileMNK, + class CtaCoordMNKL, + class MmaTileMNK, + class TiledMma, + class TensorMapC + > + CUTLASS_DEVICE auto + load( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_producer_state, + ProblemShapeMNKL problem_shape_mnkl, + CtaTileMNK cta_tile_mnk, + CtaCoordMNKL cta_coord_mnkl, + MmaTileMNK mma_tile_mnk, + TiledMma tiled_mma, + TensorStorage& shared_tensors, + cute::tuple load_tensormap_info, + bool reverse_epi_n = false) { + using namespace cute; + + // Check to see if tensormaps have been replaced in gmem + if (get<1>(load_tensormap_info) /* did_batch_change */) { + tensormaps_fence_acquire(get<0>(load_tensormap_info)); + } + + int lane_idx = canonical_lane_idx(); + auto [M, N, K, L] = problem_shape_mnkl; + auto [m_coord, n_coord, k_coord, l_coord] = cta_coord_mnkl; + + auto coord_shape = append<3>(make_shape(m_coord, n_coord),Int<0>{}); + + // Represent the full source tensor, slice to get the tile this CTA is currently responsible for + Tensor mC_mn = params.tma_load_c.get_tma_tensor(append<3>(make_shape(M,N),Int<1>{})); // (M,N,L) + Tensor mC = coalesce(mC_mn, take<0,2>(cta_tile_mnk)); + Tensor gC = local_tile(mC, take<0,2>(cta_tile_mnk), coord_shape); // (CTA_M,CTA_N) + + // Apply epilogue subtile, get matching smem tensor + auto ptr_sC = shared_tensors.collective.smem_C.begin(); + Tensor gC_epi = flat_divide(gC, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + Tensor sC_epi = make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{}); // (EPI_TILE_M,EPI_TILE_N,PIPE_C) + + // Prepare the thread(b)lock's (G)mem to (S)mem TMA tiled copy (bGS_) + ThrCopy 
thrblk_g2s = params.tma_load_c.get_slice(Int<0>{}); + Tensor bGS_gC = thrblk_g2s.partition_S(gC_epi); // (TMA,TMA_M,TMA_N,EPI_M,EPI_N) + Tensor bGS_sC = thrblk_g2s.partition_D(sC_epi); // (TMA,TMA_M,TMA_N,PIPE_C) + + // Get the fusion callbacks for the producer load warp + auto pld_args = cutlass::epilogue::fusion::detail::ProducerLoadArgs{ + problem_shape_mnkl, + cta_tile_mnk, + cta_coord_mnkl, + tiled_mma, + EpilogueTile{}, + lane_idx + }; + auto pld_callbacks = fusion_callbacks.get_producer_load_callbacks(pld_args); + bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed(); + + // Predication for TMA load (one thread issues TMA load) + bool issue_tma_load = cute::elect_one_sync(); + + // Pre-loop fusion callback entry point + pld_callbacks.begin(); + + CUTLASS_PRAGMA_UNROLL + for (int iter_n = 0; iter_n < size<3>(gC_epi); ++iter_n) { + CUTLASS_PRAGMA_UNROLL + for (int iter_m = 0; iter_m < size<2>(gC_epi); ++iter_m) { + int epi_m = iter_m, epi_n = iter_n; + if constexpr (ReuseTmem) { + if (reverse_epi_n) { + epi_n = size<3>(gC_epi) - 1 - iter_n; + } + } + // Acquire the lock for this stage + constexpr uint16_t mcast_mask = 0; + uint64_t* tma_barrier = load_pipeline.producer_get_barrier(load_pipe_producer_state); + load_pipeline.producer_acquire(load_pipe_producer_state); + + // Execute the TMA load for C if needed + if (issue_tma_load && is_C_load_needed) { + copy(params.tma_load_c.with(get<0>(load_tensormap_info), *tma_barrier, mcast_mask), + bGS_gC(_,_,_,epi_m,epi_n), bGS_sC(_,_,_,load_pipe_producer_state.index())); + load_pipeline.producer_expect_transaction(load_pipe_producer_state); + } + + // Loop fusion callback entry point + pld_callbacks.step(tma_barrier, epi_m, epi_n, load_pipe_producer_state.count(), issue_tma_load); + + // Commit TMA loads for this stage and release the lock + load_pipeline.producer_commit(load_pipe_producer_state); + ++load_pipe_producer_state; + } + } + + // Post-loop fusion callback entry point + 
pld_callbacks.end(); + + return load_pipe_producer_state; + } + + CUTLASS_DEVICE void + load_tail( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_producer_state, + [[maybe_unused]] StorePipeline store_pipeline, + [[maybe_unused]] StorePipelineState store_pipe_producer_state) { + load_pipeline.producer_tail(load_pipe_producer_state); + } + + CUTLASS_DEVICE auto + store_init( + Params const& params, + TensorMapStorage& shared_tensormap, + int32_t const sm_count, + int32_t const sm_idx) const { + // Fetch a copy of tensormaps for the CTA from Params + constexpr bool IsEpiLoad = false; + cute::TmaDescriptor* store_tensormap = nullptr; + int thread_idx = threadIdx.x % ThreadCount; + int warp_idx = thread_idx / NumThreadsPerWarp; + // Only the first epilogue warp needs to perform TMA related operations + if (warp_idx == 0) { + store_tensormap = tensormaps_init(params, shared_tensormap, sm_count, sm_idx); + } + return cute::make_tuple(store_tensormap); + } + + template< + bool ReuseTmem = false, + class AccumulatorPipeline, + class AccumulatorPipelineState, + class ProblemShapeMNKL, + class CtaTileMNK, + class CtaCoordMNKL, + class MmaTileMNK, + class TiledMma, + class AccEngine, + class AccLayout, + class TensorMapD + > + CUTLASS_DEVICE auto + store( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_consumer_state, + StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state, + AccumulatorPipeline acc_pipeline, + AccumulatorPipelineState acc_pipe_consumer_state, + ProblemShapeMNKL problem_shape_mnkl, + CtaTileMNK cta_tile_mnk, + CtaCoordMNKL cta_coord_mnkl, + MmaTileMNK mma_tile_mnk, + TiledMma tiled_mma, + cute::Tensor accumulators, + TensorStorage& shared_tensors, + cute::tuple store_tensormap_info + ) { + using namespace cute; + using ElementAccumulator = typename AccEngine::value_type; + using ElementCompute_ = typename epilogue::fusion::FusionCallbacksTraits::ElementCompute; + using ElementCompute = 
cute::conditional_t,ElementAccumulator,ElementCompute_>; + + static_assert(is_tmem::value, "Accumulator must be TMEM resident."); + static_assert(rank(accumulators) == 3, "Accumulators must be MMA-partitioned: [MMA, MMA_M, MMA_N]"); + static_assert(size<1>(accumulators) == 1 && size<2>(accumulators) == 1, "TiledMMA must match partitioned ShapeMN"); + static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(rank(CtaCoordMNKL{}) == 4, "CoordMNKL must be rank 4"); + + // Indexing variables + auto [M, N, K, L] = problem_shape_mnkl; + auto [m_coord, n_coord, k_coord, l_coord] = cta_coord_mnkl; + int thread_idx = threadIdx.x % ThreadCount; + int warp_idx = thread_idx / NumThreadsPerWarp; + [[maybe_unused]] int lane_idx = thread_idx % NumThreadsPerWarp; + + // Check to see if tensormaps have been replaced in gmem + // Only the first epilogue warp needs to perform TMA related operations + if (get<1>(store_tensormap_info) /* did_batch_change */ && warp_idx == 0) { + tensormaps_fence_acquire(get<0>(store_tensormap_info)); + } + + auto coord_shape = append<3>(make_shape(m_coord, n_coord),Int<0>{}); + + // Represent the full output tensor, slice to get the tile this CTA is responsible for + Tensor mD_mn = params.tma_store_d.get_tma_tensor(append<3>(make_shape(M,N),Int<1>{})); // (M,N,L) + Tensor mD = coalesce(mD_mn, take<0,2>(cta_tile_mnk)); + Tensor gD = local_tile(mD, take<0,2>(cta_tile_mnk), coord_shape); // (CTA_M,CTA_N) + + Tensor tAcc = accumulators(make_coord(_,_),_0{},_0{}); // (CTA_M,CTA_N) + + // Apply epilogue subtiling + Tensor tAcc_epi = flat_divide(tAcc, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + Tensor gD_epi = flat_divide( gD, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + + // Construct the corresponding pipelined smem tensors + auto ptr_sC = shared_tensors.collective.smem_C.begin(); + auto ptr_sD = shared_tensors.collective.smem_D.begin(); + Tensor sC_epi = 
cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{})); // (EPI_TILE_M,EPI_TILE_N,PIPE_C) + Tensor sD_epi = cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(ptr_sD), SmemLayoutD{})); // (EPI_TILE_M,EPI_TILE_N,PIPE_D) + + // (t)hread-partition for (t)mem to (r)egister copy (tTR_) + TiledCopy tiled_t2r = make_tmem_copy(CopyOpT2R{}, tAcc_epi(_,_,_0{},_0{})); + ThrCopy thread_t2r = tiled_t2r.get_slice(thread_idx); + Tensor tTR_tAcc = thread_t2r.partition_S(tAcc_epi); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) + Tensor tTR_sD = thread_t2r.partition_D(sD_epi(_,_,_0{})); // (T2R,T2R_M,T2R_N) + + // Allocate D and accumulator registers + // Does directly store the visitor into smem. + constexpr bool IsDirectR2S = cute::is_same_v>; + using RegisterElementD = cute::conditional_t; + Tensor tTR_rAcc = make_tensor(shape(tTR_sD)); // (T2R,T2R_M,T2R_N) + Tensor tTR_rD = make_tensor(shape(tTR_sD)); // (T2R,T2R_M,T2R_N) + + // Vectorized fragment view + constexpr int FragmentSize = DispatchPolicy::FragmentSize; + Tensor tTR_rAcc_frg = recast>(coalesce(tTR_rAcc)); // (EPI_V) + Tensor tTR_rD_frg = recast>(coalesce(tTR_rD)); // (EPI_V) + CUTE_STATIC_ASSERT(size(tTR_rAcc) % DispatchPolicy::FragmentSize == 0, "Fragment size does not vectorize properly"); + + // (t)hread-partition for (s)mem to (r)egister copy (tSR_) + TiledCopy tiled_s2r = make_tiled_copy_D(Copy_Atom{}, tiled_t2r); + ThrCopy thread_s2r = tiled_s2r.get_slice(thread_idx); + Tensor tSR_sC = thread_s2r.partition_S(sC_epi); // (S2R,S2R_M,S2R_N,PIPE_C) + Layout tSR_rC_layout = thread_s2r.retile_D(tTR_rD).layout(); // (S2R,S2R_M,S2R_N) + + // Allocate C registers + // If C smem load is a non-vectorized dst(i) = src(i) then we can allocate C registers directly in the compute type + // to eliminate some redundant pack+unpack instruction sequences for sub-word types + constexpr bool IsDirectS2R = cute::is_same_v> + && decltype(max_common_vector(tSR_rC_layout, 
tSR_sC.layout()))::value <= 1; + using RegisterElementC = cute::conditional_t; + Tensor tTR_rC = make_tensor(shape(tTR_sD)); // (T2R,T2R_M,T2R_N) + Tensor tSR_rC = thread_s2r.retile_D(tTR_rC); // (S2R,S2R_M,S2R_N) + + // (t)hread-partition for (r)egister to (r)egister copy (tRR_) + TiledCopy tiled_r2r = make_tiled_copy_D(Copy_Atom{}, tiled_t2r); + ThrCopy thread_r2r = tiled_r2r.get_slice(thread_idx); + Tensor tRR_rD_src = thread_r2r.retile_S(tTR_rD); // (R2R,R2R_M,R2R_N,EPI_M,EPI_N) + Tensor tRR_rD_dst = thread_r2r.retile_D(tTR_rD); // (R2R,R2R_M,R2R_N,EPI_M,EPI_N) + + // (t)hread-partition for (r)egister to (s)mem copy (tRS_) + TiledCopy tiled_r2s = make_tiled_copy_D(Copy_Atom{}, tiled_r2r); + ThrCopy thread_r2s = tiled_r2s.get_slice(thread_idx); + Tensor tRS_sD = thread_r2s.partition_D(sD_epi); // (R2S,R2S_M,R2S_N,PIPE_D) + Tensor tRS_rD = [&]() { + if constexpr (!IsDirectR2S) { + return make_tensor(shape(tRS_sD(_,_,_,_0{}))); + } + else{ + return thread_r2s.retile_S(tTR_rD); // (R2S,R2S_M,R2S_N) + } + }(); + + Tensor tRR_rD_dst_frg = recast>(coalesce(tRR_rD_dst)); + Tensor tRS_rD_frg = recast>(coalesce(tRS_rD)); + + // thread(b)lock-partition for (s)mem to (g)mem copy (bSG_) + ThrCopy thrblk_s2g = params.tma_store_d.get_slice(Int<0>{}); + Tensor bSG_sD = thrblk_s2g.partition_S(sD_epi); // (S2G,S2G_M,S2G_N,PIPE_D) + Tensor bSG_gD = thrblk_s2g.partition_D(gD_epi); // (S2G,S2G_M,S2G_N,EPI_M,EPI_N) + + // OOB predication for tile quantization "residue" + // Absolute coordinate tensors (dynamic) + Tensor mD_crd = make_identity_tensor(make_shape(M,N)); // (M,N) + Tensor cD_mn = local_tile(mD_crd, take<0,2>(cta_tile_mnk), make_coord(m_coord, n_coord)); // (CTA_M,CTA_N) + Tensor tTR_cD_mn = thread_t2r.partition_D(flat_divide(cD_mn, EpilogueTile{})); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) + // Relative coordinate tensors (static) + Tensor cD = make_coord_tensor(cD_mn.layout()); // (CTA_M,CTA_N) + Tensor tTR_cD = make_coord_tensor(tTR_cD_mn.layout()); // 
(T2R,T2R_M,T2R_N,EPI_M,EPI_N) + // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate + auto residue_cD = make_coord(M,N) - cD_mn(_0{}); // (m,n) + auto residue_tTR_cD = make_coord(M,N) - tTR_cD_mn(_0{}); // (m,n) + + // Get the fusion callbacks for the consumer store warps + constexpr bool RefSrc = false; // Register tensors reference T2R copy dst layout + auto cst_args = cutlass::epilogue::fusion::detail::ConsumerStoreArgs{ + problem_shape_mnkl, + cta_tile_mnk, + cta_coord_mnkl, + tiled_mma, + EpilogueTile{}, + tiled_t2r, + cD, + residue_cD, + tTR_cD, + residue_tTR_cD, + tTR_rC, + thread_idx + }; + + // Thread synchronizer for previously issued waits or fences + // to ensure visibility of smem reads/writes to threads or TMA unit + auto synchronize = [] () CUTLASS_LAMBDA_FUNC_INLINE { cutlass::arch::NamedBarrier::sync(ThreadCount, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; + + // Predication for sub-128 thread T2R tiled copy + Layout tmem_warp_layout = typename decltype(make_tmem_warp_partitioner(tAcc_epi(_,_,0,0)))::TiledLayout_TV{}; + constexpr bool predicate_tmem_load = size(tmem_warp_layout) != cosize(tmem_warp_layout); + bool issue_tmem_load = true; + + // If tmem doesn't have enough capacity to support double buffering, a portion of tmem (a column of epilogue tiles) + // is overlapped between 2 pseudo-buffers. The shared tmem portion corresponds to the last epilogue tile column of + // tmem accumulator buffer 0, and the first epilogue tile column of tmem accumulator 1. + // Thus, whenever we are processing tmem accumulator buffer 0, we process the epilogue tiles with reversed column order. + // Once the last epilogue tile column is loaded from tmem, the acc_pipeline is released. + // Then, the next accumulation stage for buffer 1 can start. 
+ [[maybe_unused]] bool reverse_epi_n = ReuseTmem && acc_pipe_consumer_state.phase() == 0; + static_assert(not (ReuseTmem && AccumulatorPipeline::Stages != 1), "Tmem reuse requires 1 accumulator stage"); + + // Predication for TMA store (a single thread from one warp issues TMA store) + bool issue_tma_store = (warp_idx == 0) && cute::elect_one_sync(); + + // In the reuse smem configuration we have StagesC smem buffers and at most StagesD committed TMA stores in flight. + // The TMA store pipeline producer acquire returns when at most StagesD-1 committed stores are in-flight, so we can + // only guarantee store completion after StagesD iterations, then we can begin issuing releases on the smem buffer locks. + // store_pipe_producer_state tracks the acquire and load_pipe_consumer_state tracks the release, in circular buffer fashion. + // If TMA store supported async transaction mbarriers we would not need this synchronous release behavior. + LoadPipelineState load_wait_state = load_pipe_consumer_state; + if constexpr (ReuseSmemC) { + load_wait_state = store_pipe_producer_state; + load_wait_state.phase_ ^= 1; + } + + // We can delay issue of TMA store by one iteration to achieve better interleaving of non-TMA instructions + // Sync requirements of smem reuse may preclude this optimization + // Delayed stores cause delayed stage releases which causes deadlock when StagesC == StagesD + [[maybe_unused]] int epi_m_prev = 0; + [[maybe_unused]] int epi_n_prev = 0; + static_assert(not (DelayTmaStore and ReuseSmemC and StagesC <= StagesD), "This TMA epilogue configuration will deadlock"); + + // The Epilogue Loop + auto epi_loop_fn = [&] (auto& cst_callbacks) CUTLASS_LAMBDA_FUNC_INLINE { + bool is_producer_load_needed = fusion_callbacks.is_producer_load_needed(); + bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed(); + + // The TMA store sequence for one epilogue loop iteration + auto tma_store_fn = [&] (int epi_m, int epi_n) 
CUTLASS_LAMBDA_FUNC_INLINE { + // Write the tile from smem to gmem with TMA + cutlass::arch::fence_view_async_shared(); // ensure smem writes are visible to TMA + synchronize(); // ensure all threads have issued their async fence + + if constexpr (is_destination_supported) { + if (issue_tma_store) { + copy(params.tma_store_d.with(get<0>(store_tensormap_info)), bSG_sD(_,_,_,store_pipe_producer_state.index()), bSG_gD(_,_,_,epi_m,epi_n)); + } + } + + // Post async fence, pre TMA commit callback entry point + cst_callbacks.tma_store(epi_m, epi_n, store_pipe_producer_state.count(), issue_tma_store); + + // Commit the TMA stores for this stage + if (issue_tma_store) { + store_pipeline.producer_commit(store_pipe_producer_state); + } + ++store_pipe_producer_state; + + // Wait for the next smem buffer to be available + if (issue_tma_store) { + store_pipeline.producer_acquire(store_pipe_producer_state); + } + synchronize(); + + if constexpr (ReuseSmemC) { + // producer_acquire returns when at most StagesD-1 committed stores are pending + bool store_finished = store_pipe_producer_state.count() > StorePipeline::UnacquiredStages; + // Let dma warp know earliest smem buffer is consumed and empty after StagesD producer commits + if (store_finished) { + if (is_producer_load_needed) { + load_pipeline.consumer_release(load_pipe_consumer_state); + } + ++load_pipe_consumer_state; + } + } + }; // tma_store_fn + + // Begin the wait for the producer load results + ConsumerToken load_wait_token{BarrierStatus::WaitDone}; + if (is_producer_load_needed) { + load_wait_token = load_pipeline.consumer_try_wait(load_wait_state); + } + // Begin the wait for the accumulator results + ConsumerToken acc_wait_token = acc_pipeline.consumer_try_wait(acc_pipe_consumer_state); + + cst_callbacks.begin(); + if (cst_callbacks.begin_sync_needed()) { + synchronize(); + } + // For each epilogue subtile within the CTA tile + constexpr int NumEpiSubtilesN = CUTE_STATIC_V(size<3>(gD_epi)); + constexpr int 
NumEpiSubtilesM = CUTE_STATIC_V(size<2>(gD_epi)); + #pragma unroll(UnrollEpiLoop ? NumEpiSubtilesN : 1) + for (int iter_n = 0; iter_n < NumEpiSubtilesN; ++iter_n) { + #pragma unroll(UnrollEpiLoop ? NumEpiSubtilesM : 1) + for (int iter_m = 0; iter_m < NumEpiSubtilesM; ++iter_m) { + int epi_m = iter_m, epi_n = iter_n; + bool is_first_iteration = iter_m == 0 && iter_n == 0; + bool is_last_iteration = iter_m == size<2>(gD_epi)-1 && iter_n == size<3>(gD_epi)-1; + bool do_acc_release = is_last_iteration; + + // Reverse subtile order for tmem reuse if necessary + if constexpr (ReuseTmem) { + if (reverse_epi_n) { + epi_n = size<3>(gD_epi) - 1 - iter_n; + } + do_acc_release = iter_m == size<2>(gD_epi)-1 && iter_n == 0; + } + + cst_callbacks.begin_loop(epi_m, epi_n); + + if (is_producer_load_needed) { + // Wait for the producer load to fill smem + load_pipeline.consumer_wait(load_wait_state, load_wait_token); + + if (is_C_load_needed) { + // Copy source tile from smem to register + copy(tiled_s2r, tSR_sC(_,_,_,load_wait_state.index()), tSR_rC); + // Ensure smem loads are complete before reusing smem for mixed types/layouts + if constexpr (ReuseSmemC && not (SmemLayoutC{} == SmemLayoutD{})) { + synchronize(); + } + } + } + + // First loop fusion callback entry point + cst_callbacks.previsit(epi_m, epi_n, load_wait_state.count(), is_producer_load_needed); + + if (is_producer_load_needed) { + // Let producer load warp know smem buffers are consumed and empty + if constexpr (not ReuseSmemC) { + cutlass::arch::fence_view_async_shared(); + load_pipeline.consumer_release(load_pipe_consumer_state); + ++load_pipe_consumer_state; + } + ++load_wait_state; + } + + if (is_first_iteration) { + // Wait for mma warp to fill tmem buffer with accumulator results + acc_pipeline.consumer_wait(acc_pipe_consumer_state, acc_wait_token); + } + + // The current tile in tmem + Tensor tTR_tAcc_mn = tTR_tAcc(_,_,_,epi_m,epi_n); + + // Compute tmem load predication if necessary + if constexpr 
(predicate_tmem_load) { + // Issue tmem load if this tile's tmem subpartition is accessible by this warp + int subpart_idx = (tTR_tAcc_mn.data().dp_ / 32) % 4; + issue_tmem_load = warp_idx == subpart_idx; + } + + // Copy accumulator tile from tmem to register + if (issue_tmem_load) { + copy(tiled_t2r, tTR_tAcc_mn, tTR_rAcc); + } + + // After the last tmem load, signal that tmem buffer is consumed and empty + if (do_acc_release) { + cutlass::arch::fence_view_async_tmem_load(); + acc_pipeline.consumer_release(acc_pipe_consumer_state); + ++acc_pipe_consumer_state; + } + + // Vectorized fragment loop with visitor callback entry point + CUTLASS_PRAGMA_UNROLL + for (int epi_v = 0; epi_v < size(tTR_rD_frg); ++epi_v) { + tTR_rD_frg(epi_v) = cst_callbacks.visit(tTR_rAcc_frg(epi_v), epi_v, epi_m, epi_n); + } + + // The latest we can delay the TMA store is right before the smem store of the next iteration + // since the current TMA store needs to be committed before we can acquire the next smem buffer + if constexpr (DelayTmaStore) { + // Issue TMA stores for the previous subtile + if (not is_first_iteration) { + tma_store_fn(epi_m_prev, epi_n_prev); + } + epi_m_prev = epi_m; + epi_n_prev = epi_n; + } + + if constexpr (!IsDirectR2S) { + // At present, only FP4 col output with scalefactor generation fusion would go into these branch + copy(tiled_r2r, tRR_rD_src, tRR_rD_dst); + } + tRS_rD_frg(_0{}) = cutlass::NumericArrayConverter{}(tRR_rD_dst_frg(_0{})); + + // Smem reduction callback entry point using current store buffer for workspace + Tensor reduction_buffer = make_tensor(raw_pointer_cast(sD_epi(_,_,store_pipe_producer_state.index()).data()), + make_layout(stride<2>(get_nonswizzle_portion(SmemLayoutD{})), _1{})); + cst_callbacks.reduce(reduction_buffer, synchronize, epi_m, epi_n, is_last_iteration, tRS_rD_frg); + + // Copy output tile from register to smem + bool issue_smem_store = issue_tmem_load; + if constexpr (is_destination_supported) { + if (issue_smem_store) { + 
copy(tiled_r2s, tRS_rD, tRS_sD(_,_,_,store_pipe_producer_state.index())); + } + } + + // Post reduction, pre TMA store callback entry point + cst_callbacks.postreduce(epi_m, epi_n, store_pipe_producer_state.count(), issue_smem_store); + + if constexpr (not DelayTmaStore) { + // Issue TMA stores for this subtile + tma_store_fn(epi_m, epi_n); + } + + cst_callbacks.end_loop(epi_m, epi_n); + + if (is_producer_load_needed) { + // Begin the wait for the next subtile producer load + load_wait_token = load_pipeline.consumer_try_wait(load_wait_state, is_last_iteration); + } + } // for epi_m + } // for epi_n + + if constexpr (DelayTmaStore) { + // Issue TMA stores for the last subtile + tma_store_fn(epi_m_prev, epi_n_prev); + } + + cst_callbacks.end(); + }; + + // + // BEGIN EPILOGUE + // + auto cst_callbacks = fusion_callbacks.template get_consumer_store_callbacks(cst_args); + epi_loop_fn(cst_callbacks); + return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state, acc_pipe_consumer_state); + } + + // API with Global Accumulator in registers for FastFP32 (emulated MMA) kernels. + // The accumulator in TMEM periodically loaded into the registers so that the MMA can clear out the TMEM accumulator + // values for better accuracy. This epilogue accepts the accumulator in registers and take TiledCopy for the + // TMEM->Reg as a parameter to be used in partitioning GMEM tensors C and D. 
+ template< + class ProblemShapeMNKL, + class CtaTileMNK, + class CtaCoordMNKL, + class MmaTileMNK, + class TiledMma, + class AccEngine, + class AccLayout, + class TiledCopyT2R, + class TensorMapD + > + CUTLASS_DEVICE auto + store( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_consumer_state, + StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state, + ProblemShapeMNKL problem_shape_mnkl, + CtaTileMNK cta_tile_mnk, + CtaCoordMNKL cta_coord_mnkl, + MmaTileMNK mma_tile_mnk, + TiledMma tiled_mma, + cute::Tensor& tTR_rAcc, // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) + TensorStorage& shared_tensors, + TensorMapD store_tensormap, + TiledCopyT2R tiled_t2r + ) { + using namespace cute; + using ElementAccumulator = typename AccEngine::value_type; + using ElementCompute_ = typename epilogue::fusion::FusionCallbacksTraits::ElementCompute; + using ElementCompute = cute::conditional_t,ElementAccumulator,ElementCompute_>; + + static_assert(is_rmem::value, "Accumulator must be Register resident."); + static_assert(rank(AccLayout{}) == 5, "Accumulators must be copy-partitioned: (T2R,T2R_M,T2R_N,EPI_M,EPI_N)"); + static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(rank(CtaCoordMNKL{}) == 4, "CoordMNKL must be rank 4"); + + // Indexing variables + auto [M, N, K, L] = problem_shape_mnkl; + auto [m_coord, n_coord, k_coord, l_coord] = cta_coord_mnkl; + int thread_idx = threadIdx.x % ThreadCount; + int warp_idx = thread_idx / NumThreadsPerWarp; + [[maybe_unused]] int lane_idx = thread_idx % NumThreadsPerWarp; + + auto coord_shape = append<3>(make_shape(m_coord, n_coord),Int<0>{}); + + // Represent the full output tensor, slice to get the tile this CTA is responsible for + Tensor mD_mn = params.tma_store_d.get_tma_tensor(append<3>(make_shape(M,N),Int<1>{})); // (M,N,L) + Tensor mD = coalesce(mD_mn, take<0,2>(cta_tile_mnk)); + Tensor gD = local_tile(mD, take<0,2>(cta_tile_mnk), coord_shape); // (CTA_M,CTA_N) + + // Apply 
epilogue subtiling + Tensor gD_epi = flat_divide( gD, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + + // Construct the corresponding pipelined smem tensors + auto ptr_sC = shared_tensors.collective.smem_C.begin(); + auto ptr_sD = shared_tensors.collective.smem_D.begin(); + Tensor sC_epi = cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{})); // (EPI_TILE_M,EPI_TILE_N,PIPE_C) + Tensor sD_epi = cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(ptr_sD), SmemLayoutD{})); // (EPI_TILE_M,EPI_TILE_N,PIPE_D) + + // (t)hread-partition for (t)mem to (r)egister copy (tTR_) + ThrCopy thread_t2r = tiled_t2r.get_slice(thread_idx); + Tensor tTR_sD = thread_t2r.partition_D(sD_epi(_,_,_0{})); // (T2R,T2R_M,T2R_N) + + // Allocate D and accumulator registers + Tensor tTR_rD = make_tensor(shape(tTR_sD)); // (T2R,T2R_M,T2R_N) + + // Vectorized fragment view + constexpr int FragmentSize = DispatchPolicy::FragmentSize; + Tensor tTR_rD_frg = recast>(coalesce(tTR_rD)); // (EPI_V) + + // (t)hread-partition for (s)mem to (r)egister copy (tSR_) + TiledCopy tiled_s2r = make_tiled_copy_D(Copy_Atom{}, tiled_t2r); + ThrCopy thread_s2r = tiled_s2r.get_slice(thread_idx); + Tensor tSR_sC = thread_s2r.partition_S(sC_epi); // (S2R,S2R_M,S2R_N,PIPE_C) + Layout tSR_rC_layout = thread_s2r.retile_D(tTR_rD).layout(); // (S2R,S2R_M,S2R_N) + + // Allocate C registers + // If C smem load is a non-vectorized dst(i) = src(i) then we can allocate C registers directly in the compute type + // to eliminate some redundant pack+unpack instruction sequences for sub-word types + constexpr bool IsDirectS2R = cute::is_same_v> + && decltype(max_common_vector(tSR_rC_layout, tSR_sC.layout()))::value <= 1; + using RegisterElementC = cute::conditional_t; + Tensor tTR_rC = make_tensor(shape(tTR_sD)); // (T2R,T2R_M,T2R_N) + Tensor tSR_rC = thread_s2r.retile_D(tTR_rC); // (S2R,S2R_M,S2R_N) + + // (t)hread-partition for (r)egister to (s)mem copy 
(tRS_) + TiledCopy tiled_r2s = make_tiled_copy_D(Copy_Atom{}, tiled_t2r); + ThrCopy thread_r2s = tiled_r2s.get_slice(thread_idx); + Tensor tRS_rD = thread_r2s.retile_S(tTR_rD); // (R2S,R2S_M,R2S_N) + Tensor tRS_sD = thread_r2s.partition_D(sD_epi); // (R2S,R2S_M,R2S_N,PIPE_D) + + // thread(b)lock-partition for (s)mem to (g)mem copy (bSG_) + ThrCopy thrblk_s2g = params.tma_store_d.get_slice(Int<0>{}); + Tensor bSG_sD = thrblk_s2g.partition_S(sD_epi); // (S2G,S2G_M,S2G_N,PIPE_D) + Tensor bSG_gD = thrblk_s2g.partition_D(gD_epi); // (S2G,S2G_M,S2G_N,EPI_M,EPI_N) + + // OOB predication for tile quantization "residue" + // Absolute coordinate tensors (dynamic) + Tensor mD_crd = make_identity_tensor(make_shape(M,N)); // (M,N) + Tensor cD_mn = local_tile(mD_crd, take<0,2>(cta_tile_mnk), make_coord(m_coord, n_coord)); // (CTA_M,CTA_N) + Tensor tTR_cD_mn = thread_t2r.partition_D(flat_divide(cD_mn, EpilogueTile{})); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) + // Relative coordinate tensors (static) + Tensor cD = make_coord_tensor(cD_mn.layout()); // (CTA_M,CTA_N) + Tensor tTR_cD = make_coord_tensor(tTR_cD_mn.layout()); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) + // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate + auto residue_cD = make_coord(M,N) - cD_mn(_0{}); // (m,n) + auto residue_tTR_cD = make_coord(M,N) - tTR_cD_mn(_0{}); // (m,n) + + // Get the fusion callbacks for the consumer store warps + constexpr bool RefSrc = false; // Register tensors reference T2R copy dst layout + auto cst_args = cutlass::epilogue::fusion::detail::ConsumerStoreArgs{ + problem_shape_mnkl, + cta_tile_mnk, + cta_coord_mnkl, + tiled_mma, + EpilogueTile{}, + tiled_t2r, + cD, + residue_cD, + tTR_cD, + residue_tTR_cD, + tTR_rC, + thread_idx + }; + + auto cst_callbacks = fusion_callbacks.template get_consumer_store_callbacks(cst_args); + bool is_producer_load_needed = fusion_callbacks.is_producer_load_needed(); + bool is_C_load_needed = is_source_supported 
&& fusion_callbacks.is_C_load_needed(); + + // Thread synchronizer for previously issued waits or fences + // to ensure visibility of smem reads/writes to threads or TMA unit + auto synchronize = [] () { cutlass::arch::NamedBarrier::sync(ThreadCount, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; + + // Predication for TMA store (one warp issues TMA store) + bool issue_tma_store = warp_idx == 0; + + // In the reuse smem configuration we have StagesC smem buffers and at most StagesD committed TMA stores in flight. + // The TMA store pipeline producer acquire returns when at most StagesD-1 committed stores are in-flight, so we can + // only guarantee store completion after StagesD iterations, then we can begin issuing releases on the smem buffer locks. + // store_pipe_producer_state tracks the acquire and load_pipe_consumer_state tracks the release, in circular buffer fashion. + // If TMA store supported async transaction mbarriers we would not need this synchronous release behavior. 
+ LoadPipelineState load_wait_state = load_pipe_consumer_state; + if constexpr (ReuseSmemC) { + load_wait_state = store_pipe_producer_state; + load_wait_state.phase_ ^= 1; + } + + // We can delay issue of TMA store by one iteration to achieve better interleaving of non-TMA instructions + // Sync requirements of smem reuse may preclude this optimization + // Delayed stores cause delayed stage releases which causes deadlock when StagesC == StagesD + int epi_m_prev = 0, epi_n_prev = 0; + static_assert(not (DelayTmaStore and ReuseSmemC and StagesC <= StagesD), "This TMA epilogue configuration will deadlock"); + + // The TMA store sequence for one subtile iteration + auto tma_store_fn = [&] (int epi_m, int epi_n) { + // Write the tile from smem to gmem with TMA + cutlass::arch::fence_view_async_shared(); // ensure smem writes are visible to TMA + synchronize(); // ensure all threads have issued their async fence + if (issue_tma_store) { + copy(params.tma_store_d.with(store_tensormap), bSG_sD(_,_,_,store_pipe_producer_state.index()), bSG_gD(_,_,_,epi_m,epi_n)); + } + + // Post async fence, pre TMA commit callback entry point + cst_callbacks.tma_store(epi_m, epi_n, store_pipe_producer_state.count(), issue_tma_store); + + // Commit the TMA stores for this stage + if (issue_tma_store) { + store_pipeline.producer_commit(store_pipe_producer_state); + } + ++store_pipe_producer_state; + + // Wait for the next smem buffer to be available + if (issue_tma_store) { + store_pipeline.producer_acquire(store_pipe_producer_state); + } + synchronize(); + + if constexpr (ReuseSmemC) { + // producer_acquire returns when at most StagesD-1 committed stores are pending + bool store_finished = store_pipe_producer_state.count() > StorePipeline::UnacquiredStages; + // Let dma warp know earliest smem buffer is consumed and empty after StagesD producer commits + if (store_finished) { + if (is_producer_load_needed) { + load_pipeline.consumer_release(load_pipe_consumer_state); + } + 
++load_pipe_consumer_state; + } + } + }; + + // + // BEGIN EPILOGUE + // + + // Begin the wait for the producer load results + ConsumerToken load_wait_token{BarrierStatus::WaitDone}; + if (is_producer_load_needed) { + load_wait_token = load_pipeline.consumer_try_wait(load_wait_state); + } + + cst_callbacks.begin(); + if (cst_callbacks.begin_sync_needed()) { + synchronize(); + } + + // For each epilogue subtile within the CTA tile + constexpr int NumEpiSubtilesN = CUTE_STATIC_V(size<3>(gD_epi)); + constexpr int NumEpiSubtilesM = CUTE_STATIC_V(size<2>(gD_epi)); + #pragma unroll(UnrollEpiLoop ? NumEpiSubtilesN : 1) + for (int iter_n = 0; iter_n < NumEpiSubtilesN; ++iter_n) { + #pragma unroll(UnrollEpiLoop ? NumEpiSubtilesM : 1) + for (int iter_m = 0; iter_m < NumEpiSubtilesM; ++iter_m) { + int epi_m = iter_m, epi_n = iter_n; + bool is_first_iteration = iter_m == 0 && iter_n == 0; + bool is_last_iteration = iter_m == size<2>(gD_epi)-1 && iter_n == size<3>(gD_epi)-1; + + cst_callbacks.begin_loop(epi_m, epi_n); + + if (is_producer_load_needed) { + // Wait for the producer load to fill smem + load_pipeline.consumer_wait(load_wait_state, load_wait_token); + + if (is_C_load_needed) { + // Copy source tile from smem to register + copy(tiled_s2r, tSR_sC(_,_,_,load_wait_state.index()), tSR_rC); + // Ensure smem loads are complete before reusing smem for mixed types/layouts + if constexpr (ReuseSmemC && not (SmemLayoutC{} == SmemLayoutD{})) { + synchronize(); + } + } + } + + // First loop fusion callback entry point + cst_callbacks.previsit(epi_m, epi_n, load_wait_state.count(), is_producer_load_needed); + + if (is_producer_load_needed) { + // Let producer load warp know smem buffers are consumed and empty + if constexpr (not ReuseSmemC) { + cutlass::arch::fence_view_async_shared(); + load_pipeline.consumer_release(load_pipe_consumer_state); + ++load_pipe_consumer_state; + } + ++load_wait_state; + } + + bool issue_smem_store = true; + Tensor tTR_rAcc_epi_tile = 
tTR_rAcc(_,_,_,epi_m,epi_n); + Tensor tTR_rAcc_frg = recast>(coalesce(tTR_rAcc_epi_tile)); // (EPI_V) + + // Vectorized fragment loop with visitor callback entry point + CUTLASS_PRAGMA_UNROLL + for (int epi_v = 0; epi_v < size(tTR_rD_frg); ++epi_v) { + tTR_rD_frg(epi_v) = cst_callbacks.visit(tTR_rAcc_frg(epi_v), epi_v, epi_m, epi_n); + } + + // The latest we can delay the TMA store is right before the smem store of the next iteration + // since the current TMA store needs to be committed before we can acquire the next smem buffer + if constexpr (DelayTmaStore) { + // Issue TMA stores for the previous subtile + if (not is_first_iteration) { + tma_store_fn(epi_m_prev, epi_n_prev); + } + epi_m_prev = epi_m; + epi_n_prev = epi_n; + } + + // Smem reduction callback entry point using current store buffer for workspace + Tensor reduction_buffer = make_tensor(raw_pointer_cast(sD_epi(_,_,store_pipe_producer_state.index()).data()), + make_layout(stride<2>(get_nonswizzle_portion(SmemLayoutD{})), _1{})); + cst_callbacks.reduce(reduction_buffer, synchronize, epi_m, epi_n, is_last_iteration, tTR_rD_frg); + + // Copy output tile from register to smem + if (issue_smem_store) { + copy(tiled_r2s, tRS_rD, tRS_sD(_,_,_,store_pipe_producer_state.index())); + } + + // Post reduction, pre TMA store callback entry point + cst_callbacks.postreduce(epi_m, epi_n, store_pipe_producer_state.count(), issue_smem_store); + + if constexpr (not DelayTmaStore) { + // Issue TMA stores for this subtile + tma_store_fn(epi_m, epi_n); + } + + cst_callbacks.end_loop(epi_m, epi_n); + + if (is_producer_load_needed) { + // Begin the wait for the next subtile producer load + load_wait_token = load_pipeline.consumer_try_wait(load_wait_state, is_last_iteration); + } + } // for epi_m + } // for epi_n + + if constexpr (DelayTmaStore) { + // Issue TMA stores for the last subtile + tma_store_fn(epi_m_prev, epi_n_prev); + } + + cst_callbacks.end(); + + return cute::make_tuple(load_pipe_consumer_state, 
store_pipe_producer_state); + } + + template + CUTLASS_DEVICE void + store_tail( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_consumer_state, + StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state, + CtaTileMNK cta_tile_mnk) { + if constexpr (ReuseSmemC) { + if (fusion_callbacks.is_producer_load_needed()) { + // wait for all TMA stores to complete + store_pipeline.producer_tail(store_pipe_producer_state); + + // Issue releases on up to StagesD-1 previously issued TMA stores + constexpr int release_stages = cute::min(StorePipeline::UnacquiredStages, get_load_pipe_increment(cta_tile_mnk)); + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < release_stages; ++stage) { + load_pipeline.consumer_release(load_pipe_consumer_state); + ++load_pipe_consumer_state; + } + } + } + } + + // + // Methods to perform different parts of TMA/Tensormap modifications + // + + template + CUTLASS_DEVICE auto + tensormaps_init(Params const& params, + TensorMapStorage& shared_tensormap, + int32_t const sm_count, + int32_t const sm_idx) const { + cute::TmaDescriptor* tma_desc = nullptr; + cute::TmaDescriptor* gmem_tensormap = params.tensormaps; + if constexpr (IsLoad) { + if (is_source_supported) { + tma_desc = &gmem_tensormap[sm_idx]; + if (cute::elect_one_sync()) { + // Bringing tensormaps from params to smem for modification later + Tensor pC_tensormap = make_tensor(params.tma_load_c.get_tma_descriptor(), Int<1>{}, Int<1>{}); + Tensor sC_tensormap = make_tensor(make_smem_ptr(&shared_tensormap.smem_tensormap_C), Int<1>{}, Int<1>{}); + copy(recast(pC_tensormap), recast(sC_tensormap)); + } + __syncwarp(); + } + } else if constexpr (is_destination_supported) { + int const offset_Ddesc = cute::is_void_v ? 
0 : sm_count; + tma_desc = &gmem_tensormap[sm_idx + offset_Ddesc]; + if (cute::elect_one_sync()) { + // Bringing tensormaps from params to smem for modification later + Tensor pD_tensormap = make_tensor(params.tma_store_d.get_tma_descriptor(), Int<1>{}, Int<1>{}); + Tensor sD_tensormap = make_tensor(make_smem_ptr(&shared_tensormap.smem_tensormap_D), Int<1>{}, Int<1>{}); + copy(recast(pD_tensormap), recast(sD_tensormap)); + } + __syncwarp(); + } + + return tma_desc; + } + + // Replace address for the global tensor (to be done by single thread) + template + CUTLASS_DEVICE + void + tensormaps_replace_global_address( + TensorMapStorage& shared_tensormap, + Params const& params, + int32_t next_batch) { + // Replacing global_address for the next batch + if constexpr (IsLoad) { + if constexpr (is_source_supported) { + if (params.ptr_C != nullptr) { + cute::tma_descriptor_replace_addr_in_shared_mem(shared_tensormap.smem_tensormap_C, + params.ptr_C[next_batch]); + } + } + } else if constexpr (is_destination_supported) { + cute::tma_descriptor_replace_addr_in_shared_mem(shared_tensormap.smem_tensormap_D, + params.ptr_D[next_batch]); + } + } + + // Replace dim and strides for the global tensor - used only for Grouped GEMM (to be done by single thread) + template + CUTLASS_DEVICE + void + tensormaps_replace_global_tensor_properties( + TensorMapStorage& shared_tensormaps, + Params const& params, + int32_t next_group, + ProblemShape_MNKL problem_shape_mnkl) { + const uint32_t M = get<0>(problem_shape_mnkl); + const uint32_t N = get<1>(problem_shape_mnkl); + // Replace all dims for consistency + constexpr int MaxTensorRank = 5; + cute::array prob_shape = {1,1,1,1,1}; + cute::array prob_stride = {0,0,0,0,0}; + + if constexpr (IsLoad) { + if constexpr (is_source_supported) { + if (params.dC != nullptr) { + ElementC const* ptr_C = nullptr; + Tensor tensor_c = make_tensor(ptr_C, make_layout(make_shape(M,N,Int<1>{}), params.dC[next_group])); + + 
cute::detail::fill_tma_gmem_shape_stride(params.tma_load_c, tensor_c, + prob_shape, prob_stride); + // Convert strides to byte strides + for (uint64_t& stride : prob_stride) { + stride = (stride * sizeof_bits_v) / 8; + } + cute::tma_descriptor_replace_dims_strides_in_shared_mem(shared_tensormaps.smem_tensormap_C, + prob_shape, + prob_stride); + } + } + } + else if constexpr (is_destination_supported) { + ElementD const* ptr_D = nullptr; + Tensor tensor_d = make_tensor(ptr_D, make_layout(make_shape(M,N,Int<1>{}), params.dD[next_group])); + + cute::detail::fill_tma_gmem_shape_stride(params.tma_store_d, tensor_d, + prob_shape, prob_stride); + // Convert strides to byte strides + for (uint64_t& stride : prob_stride) { + stride = (stride * sizeof_bits_v) / 8; + } + + cute::tma_descriptor_replace_dims_strides_in_shared_mem(shared_tensormaps.smem_tensormap_D, + prob_shape, + prob_stride); + } + } + + // The entire warp must call this function collectively (that is, the instructions are aligned) + template + CUTLASS_DEVICE + void + tensormaps_perform_update( + TensorMapStorage& shared_tensormap, + Params const& params, + cute::TmaDescriptor const* tensormap, + ProblemShape problem_shape, + int32_t next_batch) { + if (cute::elect_one_sync()) { + // Replacing global_address for the next batch + tensormaps_replace_global_address(shared_tensormap, params, next_batch); + + if constexpr (IsGroupedGemmKernel) { + auto problem_shape_MNKL = append<4>(problem_shape.get_problem_shape(next_batch), 1); + // Replacing global dims and strides for the next batch + tensormaps_replace_global_tensor_properties( + shared_tensormap, params, next_batch, problem_shape_MNKL); + } + } + // Ensure warp is converged before issuing tensormap fence release + __syncwarp(); + // Entire warp must do this (ie its aligned) + tensormaps_cp_fence_release(shared_tensormap, tensormap); + } + + template + CUTLASS_DEVICE + void + tensormaps_cp_fence_release( + TensorMapStorage& shared_tensormap, + 
cute::TmaDescriptor const* tensormap) { + // Commit and wait for all TMA load/store instructions before updating the tensormap in gmem. + // This operation only happens when the group/batch changes between consecutive tiles. + // If there are no uncommitted instructions then tma_desc_commit_group results in an empty bulk async-group. + auto tma_desc_wait_all_fn = [] () CUTLASS_LAMBDA_FUNC_INLINE { + if (cute::elect_one_sync()) { + cute::tma_desc_commit_group(); + cute::tma_desc_wait_group(); + } + }; + // Entire warp must do this (ie its aligned) + if constexpr (IsLoad) { + if (is_source_supported) { + tma_desc_wait_all_fn(); + tma_descriptor_cp_fence_release(tensormap, shared_tensormap.smem_tensormap_C); + } + } else if constexpr (is_destination_supported) { + tma_desc_wait_all_fn(); + tma_descriptor_cp_fence_release(tensormap, shared_tensormap.smem_tensormap_D); + } + } + + template + CUTLASS_DEVICE + void + tensormaps_fence_acquire(cute::TmaDescriptor const* tensormap) { + if constexpr (IsLoad) { + if (is_source_supported) { + cute::tma_descriptor_fence_acquire(tensormap); + } + } else if constexpr (is_destination_supported) { + cute::tma_descriptor_fence_acquire(tensormap); + } + } +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm100_epilogue_nosmem.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm100_epilogue_nosmem.hpp new file mode 100644 index 0000000000000000000000000000000000000000..90dfb80c00b7c4c48ce74d69cca52aeea8b80baa --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm100_epilogue_nosmem.hpp @@ 
-0,0 +1,856 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing elementwise operations used by epilogues. 
+*/ + + + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/epilogue/collective/detail.hpp" +#include "cutlass/detail/helper_macros.hpp" +#include "cutlass/conv/convnd_problem_shape.hpp" +#include "cutlass/conv/detail.hpp" + +#include "cute/tensor.hpp" +#include "cute/numeric/numeric_types.hpp" +#include "cutlass/cuda_host_adapter.hpp" +#include "cutlass/epilogue/thread/linear_combination.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +template +struct IsDefaultFusionOp { + static constexpr bool value = false; +}; + +template< + class ElementD, class ElementCompute, + class ElementC, FloatRoundStyle RoundStyle +> +struct IsDefaultFusionOp< + epilogue::fusion::LinearCombination< + ElementD, ElementCompute, ElementC, ElementCompute, RoundStyle> +> { + static constexpr bool value = true; +}; + +template< + class ElementOutput, int Count, class ElementAccumulator, + class ElementCompute, epilogue::thread::ScaleType::Kind Scale, + FloatRoundStyle Round, class ElementSource +> +struct IsDefaultFusionOp< + epilogue::thread::LinearCombination< + ElementOutput, Count, ElementAccumulator, + ElementCompute, Scale, Round, ElementSource> +> { + static constexpr bool value = true; +}; + +// Legacy direct store sm100 epilogue using thread::LinearCombination, do not expect this to be stable +template < + class EpilogueTile_, // (EPI_TILE_M, EPI_TILE_N) + class ElementC_, + class StrideC_, + class ElementD_, + class StrideD_, + class ThreadEpilogueOp_, + class CopyOpT2R_, + class AlignmentC_, + class AlignmentD_ +> +class CollectiveEpilogue< + Sm100NoSmem, + EpilogueTile_, + ElementC_, + StrideC_, + ElementD_, + StrideD_, + ThreadEpilogueOp_, + CopyOpT2R_, + AlignmentC_, + AlignmentD_, + cute::enable_if_t::value> +> { +public: + // + // Type Aliases + // + using DispatchPolicy = Sm100NoSmem; + using EpilogueTile = EpilogueTile_; + // derived 
types of output thread level operator + using ThreadEpilogueOp = ThreadEpilogueOp_; + using ElementOutput = typename ThreadEpilogueOp::ElementOutput; + using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator; + using ElementCompute = typename ThreadEpilogueOp::ElementCompute; + using ElementScalar = ElementCompute; + using ElementBias = typename detail::IsThreadEpilogueOpWithBias::type; + using ElementC = ElementC_; + using StrideC = StrideC_; + using ElementD = ElementD_; + using StrideD = StrideD_; + using CopyOpT2R = CopyOpT2R_; + using AlignmentC = AlignmentC_; + using AlignmentD = AlignmentD_; + using GmemElementC = cute::conditional_t,ElementD,ElementC>; // prevents void ref breakages + + using GmemTiledCopyC = void; + using GmemTiledCopyD = void; + + constexpr static int ThreadCount = 128; + constexpr static int kOutputAlignment = ThreadEpilogueOp::kCount; + constexpr static bool isEpilogueBiasSupported = detail::IsThreadEpilogueOpWithBias::value; + constexpr static bool isSourceNeeded = not cute::is_void_v; + + using AlignmentType = typename cute::uint_bit::value * kOutputAlignment>::type; + constexpr static uint32_t TmaTransactionBytes = 0; + + struct SharedStorage { }; + + // Host side epilogue arguments + struct Arguments { + typename ThreadEpilogueOp::Params thread{}; + ElementC const* ptr_C = nullptr; + StrideC dC{}; + ElementD* ptr_D = nullptr; + StrideD dD{}; + }; + + // Device side epilogue params + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments( + [[maybe_unused]] ProblemShape const& problem_shape, + Arguments const& args, + [[maybe_unused]] void* workspace) { + return args; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter 
= nullptr) { + return cutlass::Status::kSuccess; + } + + template + static bool + can_implement(cutlass::conv::ConvProblemShape const& problem_shape, Arguments const& args) { + return can_implement(cutlass::conv::detail::get_transformed_problem_shape_MNKL(problem_shape), args); + } + + template + static bool + can_implement( + [[maybe_unused]] ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M,N,K,L] = problem_shape_MNKL; + auto shape = cute::make_shape(M,N,L); + + bool implementable = true; + implementable = implementable && cutlass::detail::check_alignment(shape, StrideD{}); + if constexpr (isSourceNeeded) { + implementable = implementable && cutlass::detail::check_alignment(shape, StrideC{}); + } + return implementable; + } + + // + // Constructor and Data Members + // + CUTLASS_DEVICE + CollectiveEpilogue(Params const& params, SharedStorage&) : params(params) { }; + +protected: + Params const& params; + + // + // Non-static Device Methods + // +public: + template< + bool ReuseTmem = false, + class AccumulatorPipeline, + class AccumulatorPipelineState, + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL, + class AccEngine, class AccLayout + > + CUTLASS_DEVICE auto + operator()( + AccumulatorPipeline acc_pipeline, + AccumulatorPipelineState acc_pipe_consumer_state, + ProblemShapeMNKL problem_shape_mnkl, + TileShapeMNK cta_tile_shape_mnk, + TileCoordMNKL cta_coord_mnkl, + cute::Tensor const& accumulators, // (MMA,MMA_M,MMA_N) + [[maybe_unused]] SharedStorage&) { + + using namespace cute; + using X = Underscore; + + static_assert(is_tmem::value, "Accumulator must be TMEM resident."); + static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(rank(TileCoordMNKL{}) == 4, "TileCoordMNKL must be rank 4"); + + auto problem_shape_mnl = select<0,1,3>(problem_shape_mnkl); + auto cta_coord_mnl = 
select<0,1,3>(cta_coord_mnkl); + auto cta_tiler = take<0,2>(cta_tile_shape_mnk); + + // Represent the full output tensor, slice to get the tile this CTA is responsible for + Tensor mC = make_tensor(make_gmem_ptr(params.ptr_C), problem_shape_mnl, append<3>(params.dC,_0{})); // (M,N,L) + Tensor mD = make_tensor(make_gmem_ptr(params.ptr_D), problem_shape_mnl, append<3>(params.dD,_0{})); // (M,N,L) + Tensor gC = local_tile(mC, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) + Tensor gD = local_tile(mD, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) + + // Partition source and destination tiles according to tmem copy T2R partitioning (tTR_) + auto tiled_t2r = make_tmem_copy(CopyOpT2R{}, tensor<0>(accumulators)); + auto thread_idx = threadIdx.x % size(tiled_t2r); + + auto thread_t2r = tiled_t2r.get_slice(thread_idx); + Tensor tTR_gC = thread_t2r.partition_D(gC); // (T2R,T2R_M,T2R_N) + Tensor tTR_gD = thread_t2r.partition_D(gD); // (T2R,T2R_M,T2R_N) + Tensor tTR_rAcc = make_tensor(shape(tTR_gD)); // (T2R,T2R_M,T2R_N) + + Tensor tTR_rC = make_tensor(shape(tTR_gC)); // (T2R,T2R_M,T2R_N) + + Tensor coordCD = make_identity_tensor(problem_shape_mnl); // (M,N,L) -> (m,n,l) + Tensor cCD = local_tile(coordCD, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) -> (m,n,l) + Tensor tTR_cCD = thread_t2r.partition_D(cCD); // (T2R,T2R_M,T2R_N) -> (m,n,l) + + constexpr auto mclD = decltype(max_common_layout(tTR_rAcc.layout(), tTR_gD.layout())){}; + constexpr int VD = cute::min(AlignmentD{}, size(mclD)); + Tensor tTR_rD_frag = make_tensor(shape(tTR_rAcc)); + Tensor tTR_rD_src = recast>(coalesce(tTR_rD_frag)); + Tensor tR2G_rD_dst = recast>(coalesce(tTR_gD)); + + Tensor tTR_cD_mn_frg = tensor<1>(zipped_divide(coalesce(tTR_cCD), mclD.compose(Int{}))); + Tensor tDpD = make_tensor(shape(tR2G_rD_dst)); + + CUTLASS_PRAGMA_UNROLL + for (int t = 0; t < size(tDpD); t++) { + tDpD(t) = elem_less(tTR_cD_mn_frg(t), problem_shape_mnl); + } + + constexpr auto mclC = decltype(max_common_layout(tTR_rAcc.layout(), 
tTR_gC.layout())){}; + constexpr int VC = cute::min(AlignmentC{}, size(mclC)); + + Tensor tTR_cC_mn_frg = tensor<1>(zipped_divide(coalesce(tTR_cCD), mclC.compose(Int{}))); + Tensor tG2R_rC_dst = recast>(coalesce(tTR_gC)); + Tensor tCpC = make_tensor(shape(tG2R_rC_dst)); + + CUTLASS_PRAGMA_UNROLL + for (int t = 0; t < size(tCpC); t++) { + tCpC(t) = elem_less(tTR_cC_mn_frg(t), problem_shape_mnl); + } + Tensor tTR_rC_src = recast>(coalesce(tTR_gC)); + Tensor tTR_rC_dst = recast>(coalesce(tTR_rC)); + + // Detect interleaved complex fp32 kernels + [[maybe_unused]] Tensor accs = accumulators; + using ElementTmem = typename decltype(accs)::value_type; + constexpr bool is_interleaved_complex_f32 = is_complex::value && cute::is_same_v; + + // 1. Load accumulators into register from tmem + // Tmem -> rmem and transformation for interleaved complex kernels + if constexpr (is_interleaved_complex_f32) { + using ElementComputeAccumulator = float; + + Tensor tAccReal = accumulators(make_coord(_,_),_0{},_0{},_0{}); // (CTA_M,CTA_N) + Tensor tAccImag = accumulators(make_coord(_,_),_0{},_0{},_1{}); // (CTA_M,CTA_N) + Tensor tTR_tAccReal = thread_t2r.partition_S(tAccReal); // (T2R,T2R_M,T2R_N) + Tensor tTR_tAccImag = thread_t2r.partition_S(tAccImag); // (T2R,T2R_M,T2R_N) + Tensor tTR_rAccReal = make_tensor(shape(tTR_gD)); // (T2R,T2R_M,T2R_N) + Tensor tTR_rAccImag = make_tensor(shape(tTR_gD)); // (T2R,T2R_M,T2R_N) + + copy(tiled_t2r, tTR_tAccReal, tTR_rAccReal); + copy(tiled_t2r, tTR_tAccImag, tTR_rAccImag); + + // 1.1. 
Transform accumulators in registers + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tTR_rAccReal); i++) { + tTR_rAcc(i) = {tTR_rAccReal(i), tTR_rAccImag(i)}; + } + } + + // Standard tmem -> rmem epilogue + else { + Tensor tAcc = accumulators(make_coord(_,_),_0{},_0{}); // (CTA_M,CTA_N) + Tensor tTR_tAcc = thread_t2r.partition_S(tAcc); // (T2R,T2R_M,T2R_N) + + copy(tiled_t2r, tTR_tAcc, tTR_rAcc); + } + + cutlass::arch::fence_view_async_tmem_load(); + acc_pipeline.consumer_release(acc_pipe_consumer_state); + ++acc_pipe_consumer_state; + + // 2. Apply element-wise operation and store to gmem + ThreadEpilogueOp epilogue_op{params.thread}; + // source is needed + if (epilogue_op.is_source_needed()) { + copy_if(tCpC, tTR_rC_src, tTR_rC_dst); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tTR_rAcc); i++) { + tTR_rD_frag(i) = epilogue_op(tTR_rAcc(i), tTR_rC(i)); + } + + copy_if(tDpD, tTR_rD_src, tR2G_rD_dst); + } + // source is not needed, avoid load + else { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tTR_rAcc); i++) { + tTR_rD_frag(i) = epilogue_op(tTR_rAcc(i)); + } + + copy_if(tDpD, tTR_rD_src, tR2G_rD_dst); + } + + return cute::make_tuple(acc_pipe_consumer_state); + } + + + // API with Global Accumulator in registers for FastFP32 (emulated MMA) kernels. + // The accumulator in TMEM periodically loaded into the registers so that the MMA can clear out the TMEM accumulator + // values for better accuracy. This epilogue accepts the accumulator in registers and take TiledCopy for the + // TMEM->Reg as a parameter to be used in partitioning GMEM tensors C and D. 
+ template< + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL, + class AccEngine, class AccLayout, + class TiledCopy + > + CUTLASS_DEVICE void + operator()( + ProblemShapeMNKL problem_shape_mnkl, + TileShapeMNK cta_tile_shape_mnk, + TileCoordMNKL cta_coord_mnkl, + cute::Tensor& tTR_rGlobAcc, // (MMA,MMA_M,MMA_N) + [[maybe_unused]] SharedStorage&, + TiledCopy tiled_t2r) { + + using namespace cute; + using X = Underscore; + + static_assert(is_rmem::value, "Accumulator must be Register resident."); + static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(rank(AccLayout{}) == 5, "Accumulators must be copy-partitioned: (T2R,T2R_M,T2R_N,EPI_M,EPI_N)"); + static_assert(rank(TileCoordMNKL{}) == 4, "TileCoordMNKL must be rank 4"); + + auto problem_shape_mnl = select<0,1,3>(problem_shape_mnkl); + auto cta_coord_mnl = select<0,1,3>(cta_coord_mnkl); + auto cta_tiler = take<0,2>(cta_tile_shape_mnk); + + // Represent the full output tensor, slice to get the tile this CTA is responsible for + Tensor mC = make_tensor(make_gmem_ptr(params.ptr_C), problem_shape_mnl, append<3>(params.dC,_0{})); // (M,N,L) + Tensor mD = make_tensor(make_gmem_ptr(params.ptr_D), problem_shape_mnl, append<3>(params.dD,_0{})); // (M,N,L) + Tensor gC = local_tile(mC, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) + Tensor gD = local_tile(mD, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) + + + // Partition source and destination tiles according to tmem copy T2R partitioning (tTR_) + auto thread_t2r = tiled_t2r.get_slice(threadIdx.x % size(tiled_t2r)); + Tensor tTR_gC = thread_t2r.partition_D(gC); // (T2R,T2R_M,T2R_N) + Tensor tTR_gD = thread_t2r.partition_D(gD); // (T2R,T2R_M,T2R_N) + + + Tensor coordCD = make_identity_tensor(problem_shape_mnl); // (M,N,L) -> (m,n,l) + Tensor cCD = local_tile(coordCD, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) -> (m,n,l) + Tensor tTR_cCD = thread_t2r.partition_D(cCD); // (T2R,T2R_M,T2R_N) -> (m,n,l) + + // 2. 
Apply element-wise operation and store to gmem + ThreadEpilogueOp epilogue_op{params.thread}; + // source is needed + if (epilogue_op.is_source_needed()) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tTR_rGlobAcc); ++i) { + if (elem_less(tTR_cCD(i), problem_shape_mnl)) { + tTR_gD(i) = epilogue_op(tTR_rGlobAcc(i), tTR_gC(i)); + } + } + } + // source is not needed, avoid load + else { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tTR_rGlobAcc); ++i) { + if (elem_less(tTR_cCD(i), problem_shape_mnl)) { + tTR_gD(i) = epilogue_op(tTR_rGlobAcc(i)); + } + } + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Direct store sm100 epilogue supporting EVT +template < + class EpilogueTile_, // (EPI_TILE_M, EPI_TILE_N) + class ElementC_, + class StrideC_, + class ElementD_, + class StrideD_, + class FusionCallbacks_, + class CopyOpT2R_, + class AlignmentC_, + class AlignmentD_ +> +class CollectiveEpilogue< + Sm100NoSmem, + EpilogueTile_, + ElementC_, + StrideC_, + ElementD_, + StrideD_, + FusionCallbacks_, + CopyOpT2R_, + AlignmentC_, + AlignmentD_, + cute::enable_if_t::value> +> { +public: + // + // Type Aliases + // + // Required by the gemm::kernel + using DispatchPolicy = Sm100NoSmem; + using ElementC = ElementC_; + using ElementD = ElementD_; + using GmemElementC = cute::conditional_t,ElementD,ElementC>; // prevents void ref breakages + using StrideC = StrideC_; + using StrideD = StrideD_; + using EpilogueTile = EpilogueTile_; + using CopyOpT2R = CopyOpT2R_; + using FusionCallbacks = FusionCallbacks_; + using ThreadEpilogueOp = typename epilogue::fusion::FusionCallbacksTraits::Operation; + + using GmemTiledCopyC = void; + using GmemTiledCopyD = void; + +private: + constexpr static bool IsReductionBufferNeeded = ThreadEpilogueOp::IsDePerRowBiasSupported + || is_same_v; // alloc reduction buffer for custom EVTs + constexpr static size_t ImplicitSharedStorageSize = IsReductionBufferNeeded ? 
size(EpilogueTile{}) : 0; + + // Not unroll epi subtile loop when the activation op is heavy to reduce instruction size and register pressure. + constexpr static bool UnrollEpiLoop = + not cutlass::epilogue::thread::kIsHeavy_member_or_false::value; + +public: + constexpr static int ThreadCount = 128; + constexpr static uint32_t TmaTransactionBytes = 0; + + struct SharedStorage { + using FusionStorage = typename FusionCallbacks::SharedStorage; + FusionStorage thread; + array_aligned buffer; + }; + + // Host side epilogue arguments + struct Arguments { + typename FusionCallbacks::Arguments thread{}; + ElementC const* ptr_C = nullptr; + StrideC dC = {}; + ElementD* ptr_D = nullptr; + StrideD dD = {}; + }; + + // Device side epilogue params + struct Params { + typename FusionCallbacks::Params thread{}; + ElementC const* ptr_C = nullptr; + StrideC dC = {}; + ElementD* ptr_D = nullptr; + StrideD dD = {}; + }; + + // + // Constructor and Data Members + // + CUTLASS_DEVICE + CollectiveEpilogue(Params const& params_, SharedStorage& shared_tensors) + : fusion_callbacks(params_.thread, shared_tensors.thread) + , smem_buffer_ptr(shared_tensors.buffer.data()) + , params(params_) {}; + +protected: + FusionCallbacks fusion_callbacks; + uint8_t* smem_buffer_ptr; + Params const& params; + +public: + + template + static constexpr Params + to_underlying_arguments( + [[maybe_unused]] ProblemShape const& problem_shape, + Arguments const& args, + [[maybe_unused]] void* workspace) { + return { + FusionCallbacks::to_underlying_arguments(problem_shape, args.thread, workspace), + args.ptr_C, + args.dC, + args.ptr_D, + args.dD + }; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return FusionCallbacks::get_workspace_size(problem_shape, args.thread); + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + 
CudaHostAdapter* cuda_adapter = nullptr) { + return FusionCallbacks::initialize_workspace(problem_shape, args.thread, workspace, stream, cuda_adapter); + } + + template + static bool + can_implement( + [[maybe_unused]] ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + + bool fusion_implementable = FusionCallbacks::can_implement(problem_shape, args.thread); + if (!fusion_implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum requirements for FusionCallbacks.\n"); + } + return fusion_implementable; + } + + + template< + bool ReuseTmem = false, + class AccumulatorPipeline, + class AccumulatorPipelineState, + class ProblemShapeMNKL, + class CtaTileMNK, + class CtaCoordMNKL, + class AccEngine, class AccLayout + > + CUTLASS_DEVICE auto + operator()( + AccumulatorPipeline acc_pipeline, + AccumulatorPipelineState acc_pipe_consumer_state, + ProblemShapeMNKL problem_shape_mnkl, + CtaTileMNK cta_tile_mnk, + CtaCoordMNKL cta_coord_mnkl, + cute::Tensor accumulators, + [[maybe_unused]] SharedStorage& + ) { + using ElementAccumulator = typename AccEngine::value_type; + using ElementCompute_ = typename epilogue::fusion::FusionCallbacksTraits::ElementCompute; + using ElementCompute = cute::conditional_t,ElementAccumulator,ElementCompute_>; + + // Wait for mma warp to fill tmem buffer with accumulator results + static_assert(is_tmem::value, "Accumulator must be TMEM resident."); + static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(rank(CtaCoordMNKL{}) == 4, "TileCoordMNKL must be rank 4"); + static_assert(cute::sizeof_bits_v != 6, "Output element requires smem"); + + auto [M, N, K, L] = problem_shape_mnkl; + auto problem_shape_mnl = select<0,1,3>(problem_shape_mnkl); + auto cta_coord_mnl = select<0,1,3>(cta_coord_mnkl); + auto cta_tiler = take<0,2>(cta_tile_mnk); + + int thread_idx = threadIdx.x % ThreadCount; + + Tensor tAcc = accumulators(make_coord(_,_),_0{},_0{}); 
// (CTA_M,CTA_N) + Tensor tAcc_epi = flat_divide(tAcc, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + TiledCopy tiled_t2r = make_tmem_copy(CopyOpT2R{}, tAcc_epi(_,_,_0{},_0{})); + ThrCopy thread_t2r = tiled_t2r.get_slice(thread_idx); + Tensor tTR_tAcc = thread_t2r.partition_S(tAcc_epi); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) + + constexpr int FragmentSize = size(EpilogueTile{}) / ThreadCount; + + Tensor coordD = make_identity_tensor(problem_shape_mnl); // (M,N,L) -> (m,n,l) + Tensor cD = local_tile(coordD, cta_tiler, cta_coord_mnl); // (CTA_M,CTA_N) -> (m,n,l) + Tensor cD_epi = flat_divide(cD, EpilogueTile{}); + Tensor tTR_cD = thread_t2r.partition_D(cD_epi); // (T2R,T2R_M,T2R_N) -> (m,n,l) + + Tensor tTR_rAcc = make_tensor(shape(tTR_cD(_,_,_,_0{},_0{}))); + + // Construct the EVT consumer callbacks + auto residue_cD = make_coord(M,N) - cD(_0{}); + auto residue_tTR_cD = make_coord(M,N) - tTR_cD(_0{}); + Tensor cD_ = make_coord_tensor(cD.layout()); + Tensor tTR_cD_ = make_coord_tensor(tTR_cD.layout()); + constexpr bool RefSrc = false; + + Tensor mC = make_tensor(make_gmem_ptr(params.ptr_C), make_shape(M,N,L), params.dC); + + Tensor tTR_gC = cutlass::epilogue::fusion::sm90_partition_for_epilogue( + mC, cta_tile_mnk, cta_coord_mnkl, EpilogueTile{}, tiled_t2r, thread_idx); + + Tensor mD = make_tensor(make_gmem_ptr(recast_ptr(params.ptr_D)), make_shape(M,N,L), params.dD); + + Tensor tTR_gD = cutlass::epilogue::fusion::sm90_partition_for_epilogue( + mD, cta_tile_mnk, cta_coord_mnkl, EpilogueTile{}, tiled_t2r, thread_idx); + + // Register Tensor + Tensor tTR_rD = make_tensor(take<0,3>(shape(tTR_gD))); + + Tensor coord_cCD = make_identity_tensor(problem_shape_mnl); + Tensor tTR_cCD = cutlass::epilogue::fusion::sm90_partition_for_epilogue( + coord_cCD, cta_tile_mnk, cta_coord_mnkl, EpilogueTile{}, tiled_t2r, thread_idx); + constexpr auto mclD = decltype(max_common_layout(tTR_gD(_,_,_,_0{},_0{}), tTR_rD)){}; + constexpr int VD = cute::min(AlignmentD_{}, size(mclD)); + + 
auto tCrC = make_tensor(take<0,3>(shape(tTR_gC))); + constexpr auto mclC = decltype(max_common_layout(tTR_gC(_,_,_,_0{},_0{}), tCrC)){}; + constexpr int VC = cute::min(AlignmentC_{}, size(mclC)); + + Tensor tTR_rD_frg = recast>(coalesce(tTR_rD)); + + auto cst_args = cutlass::epilogue::fusion::detail::ConsumerStoreArgs{ + problem_shape_mnkl, + cta_tile_mnk, + cta_coord_mnkl, + int(0), + EpilogueTile{}, + tiled_t2r, + cD_, + residue_cD, + tTR_cD_, + residue_tTR_cD, + tCrC, + thread_idx + }; + + auto synchronize = [] () CUTLASS_LAMBDA_FUNC_INLINE { cutlass::arch::NamedBarrier::sync(ThreadCount, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; + + // The Epilogue Loop + auto epi_loop_fn = [&] (auto& cst_callbacks) CUTLASS_LAMBDA_FUNC_INLINE { + bool is_C_load_needed = fusion_callbacks.is_C_load_needed(); + + // Ensure there are no threads from the previous wave writing to shared memory being utilized for the current wave. + synchronize(); + cst_callbacks.begin(); + if (cst_callbacks.begin_sync_needed()) { + synchronize(); + } + + // If tmem doesn't have enough capacity to support double buffering, a portion of tmem (a column of epilogue tiles) + // is overlapped between 2 pseudo-buffers. The shared tmem portion corresponds to the last epilogue tile column of + // tmem accumulator buffer 0, and the first epilogue tile column of tmem accumulator 1. + // Thus, whenever we are processing tmem accumulator buffer 0, we process the epilogue tiles with reversed column order. + // Once the last epilogue tile column is loaded from tmem, the acc_pipeline is released. + // Then, the next accumulation stage for buffer 1 can start. 
+ [[maybe_unused]] bool reverse_epi_n = ReuseTmem && acc_pipe_consumer_state.phase() == 0; + static_assert(not (ReuseTmem && AccumulatorPipeline::Stages != 1), "Tmem reuse requires 1 accumulator stage"); + + // For each epilogue subtile within the CTA tile + constexpr int NumEpiSubtilesN = CUTE_STATIC_V(size<4>(tTR_tAcc)); + constexpr int NumEpiSubtilesM = CUTE_STATIC_V(size<3>(tTR_tAcc)); + #pragma unroll(UnrollEpiLoop ? NumEpiSubtilesN : 1) + for (int iter_n = 0; iter_n < NumEpiSubtilesN; ++iter_n) { + #pragma unroll(UnrollEpiLoop ? NumEpiSubtilesM : 1) + for (int iter_m = 0; iter_m < NumEpiSubtilesM; ++iter_m) { + int epi_m = iter_m, epi_n = iter_n; + + bool is_last_iteration = iter_m == size<3>(tTR_tAcc)-1 && iter_n == size<4>(tTR_tAcc)-1; + bool do_acc_release = is_last_iteration; + + // Reverse subtile order for tmem reuse if necessary + if constexpr (ReuseTmem) { + if (reverse_epi_n) { + epi_n = size<4>(tTR_tAcc) - 1 - iter_n; + } + do_acc_release = iter_m == size<3>(tTR_tAcc)-1 && iter_n == 0; + } + + Tensor tTR_cCD_mn = tTR_cCD(_,_,_,epi_m,epi_n); + Tensor tTR_pCD_mn = cute::lazy::transform(tTR_cCD_mn, [&] (auto const& c) CUTLASS_LAMBDA_FUNC_INLINE { return elem_less(c, problem_shape_mnl); }); + cst_callbacks.begin_loop(epi_m, epi_n); + + if constexpr (not cute::is_void_v) { + if (is_C_load_needed) { + using CVecType = uint_bit_t>; + + if constexpr (!is_same_v) { + Tensor tTR_gC_frg = recast(coalesce(tTR_gC(_,_,_,epi_m,epi_n))); + Tensor tTR_rC_frg = recast(coalesce(tCrC)); + Tensor tTR_pC_frg = tensor<1>(zipped_divide(coalesce(tTR_pCD_mn), mclC.compose(Int{}))); + copy_if(tTR_pC_frg, tTR_gC_frg, tTR_rC_frg); + } + else { + auto tiled_g2r = make_tiled_copy_D(Copy_Atom{}, tiled_t2r); + auto thr_g2r = tiled_g2r.get_slice(threadIdx.x); + Tensor c_src = thr_g2r.retile_S(tTR_gC(_,_,_,epi_m,epi_n)); + Tensor c_dst = thr_g2r.retile_D(tCrC); + Tensor c_prd = thr_g2r.retile_D(tTR_pCD_mn); + copy_if(tiled_g2r, c_prd, c_src, c_dst); + } + } + } + + // Copy 
accumulator tile from tmem to register + // The current tile in tmem + Tensor tTR_tAcc_mn = tTR_tAcc(_,_,_,epi_m,epi_n); + + Tensor tTR_rAcc_frg = recast>(coalesce(tTR_rAcc)); + + copy(tiled_t2r, tTR_tAcc_mn, tTR_rAcc); + + // After the last tmem load, signal that tmem buffer is consumed and empty + if (do_acc_release) { + cutlass::arch::fence_view_async_tmem_load(); + acc_pipeline.consumer_release(acc_pipe_consumer_state); + ++acc_pipe_consumer_state; + } + + CUTLASS_PRAGMA_UNROLL + for (int epi_v = 0; epi_v < size(tTR_rAcc_frg); ++epi_v) { + tTR_rD_frg(epi_v) = cst_callbacks.visit(tTR_rAcc_frg(epi_v), epi_v, epi_m, epi_n); + } + + Tensor reduction_buffer = make_tensor( + raw_pointer_cast(make_smem_ptr(smem_buffer_ptr)), make_layout(Shape>{})); + + cst_callbacks.reduce(reduction_buffer, synchronize, epi_m, epi_n, is_last_iteration, tTR_rAcc /*not used*/); + + cst_callbacks.end_loop(epi_m, epi_n); + + using VecType = uint_bit_t>; + if constexpr (!is_same_v) { + Tensor tTR_gD_frg = recast(coalesce(tTR_gD(_,_,_,epi_m,epi_n))); + Tensor tTR_rD_frg = recast(coalesce(tTR_rD)); + Tensor tTR_pD_frg = tensor<1>(zipped_divide(coalesce(tTR_pCD_mn), mclD.compose(Int{}))); + copy_if(tTR_pD_frg, tTR_rD_frg, tTR_gD_frg); + } + else { + auto tiled_r2g = make_tiled_copy_D(Copy_Atom{}, tiled_t2r); + auto thr_r2g = tiled_r2g.get_slice(threadIdx.x); + Tensor src = thr_r2g.retile_S(tTR_rD); + Tensor dst = thr_r2g.retile_D(tTR_gD(_,_,_,epi_m,epi_n)); + Tensor prd = thr_r2g.retile_D(tTR_pCD_mn); + copy_if(tiled_r2g, prd, src, dst); + } + + } // for epi_m + } // for epi_n + + cst_callbacks.end(); + }; + + // + // BEGIN EPILOGUE + // + auto cst_callbacks = fusion_callbacks.template get_consumer_store_callbacks(cst_args); + epi_loop_fn(cst_callbacks); + return cute::make_tuple(acc_pipe_consumer_state); + } + +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// +// For sm100 kernels requiring warp specialized epilogues +template < + 
class EpilogueTile_, // (EPI_TILE_M, EPI_TILE_N) + class ElementC_, + class StrideC_, + class ElementD_, + class StrideD_, + class ThreadEpilogueOp_, + class CopyOpT2R_, + class AlignmentC_, + class AlignmentD_ +> +class CollectiveEpilogue< + Sm100NoSmemWarpSpecialized, + EpilogueTile_, + ElementC_, + StrideC_, + ElementD_, + StrideD_, + ThreadEpilogueOp_, + CopyOpT2R_, + AlignmentC_, + AlignmentD_ +> : public detail::Sm100TmaWarpSpecializedAdapter> +{ +public: + // ctor inheritance + using detail::Sm100TmaWarpSpecializedAdapter>::Sm100TmaWarpSpecializedAdapter; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm100_epilogue_tma_warpspecialized.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm100_epilogue_tma_warpspecialized.hpp new file mode 100644 index 0000000000000000000000000000000000000000..412a4b7b747b60ebedfa26eec95a692b4d9adaf4 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm100_epilogue_tma_warpspecialized.hpp @@ -0,0 +1,1299 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Functor performing elementwise operations used by epilogues. 
+*/ + + + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/arch/barrier.h" +#include "cutlass/conv/convnd_problem_shape.hpp" +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/detail.hpp" +#include "cutlass/epilogue/thread/scale_type.h" +#include "cutlass/epilogue/fusion/callbacks.hpp" +#include "cutlass/epilogue/fusion/sm100_callbacks_tma_warpspecialized.hpp" +#include "cutlass/detail/layout.hpp" +#include "cutlass/detail/helper_macros.hpp" +#include "cutlass/trace.h" + +#include "cutlass/conv/detail.hpp" +#include "cute/tensor.hpp" +#include "cutlass/cuda_host_adapter.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + int StagesC_, + int StagesD_, + int FragmentSize_, + bool ReuseSmemC_, + bool DelayTmaStore_, + class CtaTileShape_, // (CTA_M,CTA_N,CTA_K, optional: Tile_L) + class EpilogueTile_, // (EPI_TILE_M, EPI_TILE_N) + class ElementC_, + class StrideC_, + class ElementD_, + class StrideD_, + class FusionCallbacks_, + class CopyOpT2R_, + class CopyOpG2S_, + class SmemLayoutAtomC_, + class CopyOpS2R_, + class CopyOpS2G_, + class SmemLayoutAtomD_, + class CopyOpR2S_, + class CopyOpR2R_ +> +class CollectiveEpilogue< + Sm100TmaWarpSpecialized, + CtaTileShape_, + EpilogueTile_, + ElementC_, + StrideC_, + ElementD_, + StrideD_, + FusionCallbacks_, + CopyOpT2R_, + CopyOpG2S_, + SmemLayoutAtomC_, + CopyOpS2R_, + CopyOpS2G_, + SmemLayoutAtomD_, + CopyOpR2S_, + CopyOpR2R_ +> { +public: + // + // Type Aliases + // + using DispatchPolicy = Sm100TmaWarpSpecialized; + using CtaTileShape = CtaTileShape_; + using EpilogueTile = EpilogueTile_; + using FusionCallbacks = FusionCallbacks_; + using ElementC = ElementC_; + using StrideC = StrideC_; + using ElementD = ElementD_; + using StrideD = StrideD_; + using 
CopyOpT2R = CopyOpT2R_; + using CopyOpG2S = CopyOpG2S_; + using SmemLayoutAtomC = SmemLayoutAtomC_; + using CopyOpS2R = CopyOpS2R_; + using CopyOpS2G = CopyOpS2G_; + using SmemLayoutAtomD = SmemLayoutAtomD_; + using CopyOpR2S = CopyOpR2S_; + using CopyOpR2R = CopyOpR2R_; + + using ThreadEpilogueOp = typename epilogue::fusion::FusionCallbacksTraits::Operation; + using GmemTiledCopyC = CopyOpG2S; + using GmemTiledCopyD = CopyOpS2G; + + constexpr static int ThreadCount = 128; + + static_assert(!is_layout::value && is_tuple::value, "EpilogueTile must be a cute::Tile or cute::Shape"); + static_assert(rank(EpilogueTile{}) == 2, "EpilogueTile must be rank-2: [EPI_TILE_M, EPI_TILE_N]"); + +private: + using GmemElementD = ElementD; + using GmemElementC = cute::conditional_t,ElementD,ElementC>; // prevents void ref breakages + using SmemElementD = typename cutlass::detail::get_unpacked_element_type::type; + using SmemElementC = typename cutlass::detail::get_unpacked_element_type::type; + constexpr static int StagesC = StagesC_; + constexpr static int StagesD = StagesD_; + static_assert(StagesC >= 1, "StagesC must be >= 1"); + static_assert(StagesD >= 1, "StagesD must be >= 1"); + + constexpr static bool ReuseSmemC = ReuseSmemC_; + constexpr static bool is_source_supported = not cute::is_void_v; + + constexpr static bool is_m_major_C = detail::is_m_major(); + constexpr static bool is_m_major_D = detail::is_m_major(); + + constexpr static bool is_im2col_C = cute::is_same_v; + constexpr static bool is_im2col_D = cute::is_same_v; + + using SmemLayoutStageC = decltype(tile_to_shape(SmemLayoutAtomC{}, product_each(shape(EpilogueTile{})), + cute::conditional_t, Step<_1,_2>>{} )); + using SmemLayoutStageD = decltype(tile_to_shape(SmemLayoutAtomD{}, product_each(shape(EpilogueTile{})), + cute::conditional_t, Step<_1,_2>>{} )); + + constexpr static int StageCBits = cosize_v * sizeof_bits_v; + constexpr static int StageDBits = cosize_v * sizeof_bits_v; + constexpr static int 
MaxStageBits = cute::max(StageCBits, StageDBits); + constexpr static int StrideStageC = (ReuseSmemC ? MaxStageBits : StageCBits) / sizeof_bits_v; + constexpr static int StrideStageD = (ReuseSmemC ? MaxStageBits : StageDBits) / sizeof_bits_v; + + using SmemLayoutC = decltype(cute::append<3>(SmemLayoutStageC{}, Layout, Int>{})); + using SmemLayoutD = decltype(cute::append<3>(SmemLayoutStageD{}, Layout, Int>{})); + + constexpr static bool support_smem_reuse = is_source_supported && StagesD <= StagesC + && MaxStageBits % sizeof_bits_v == 0 + && MaxStageBits % sizeof_bits_v == 0; + static_assert(not (ReuseSmemC && not support_smem_reuse), "Smem reuse requirements not met"); + + constexpr static size_t SmemAlignmentC = cutlass::detail::alignment_for_swizzle(SmemLayoutC{}); + constexpr static size_t SmemAlignmentD = cutlass::detail::alignment_for_swizzle(SmemLayoutD{}); + constexpr static size_t MaxSmemAlignment = cute::max(SmemAlignmentC, SmemAlignmentD); + + // Not unroll epi subtile loop when the activation op is heavy to reduce instruction size and register pressure. 
+ constexpr static bool UnrollEpiLoop = + not cutlass::epilogue::thread::kIsHeavy_member_or_false::value; + // TMA store delay only benefits with loop unrolling + constexpr static bool DelayTmaStore = DelayTmaStore_ and UnrollEpiLoop; + + struct CollectiveStorageWithC { + alignas(SmemAlignmentC) ArrayEngine> smem_C; + alignas(SmemAlignmentD) ArrayEngine> smem_D; + }; + + union CollectiveStorageWithoutC { + cute::array smem_C; + alignas(SmemAlignmentD) ArrayEngine> smem_D; + }; + + union CollectiveStorageReuseC { + alignas(MaxSmemAlignment) ArrayEngine> smem_C; + alignas(MaxSmemAlignment) ArrayEngine> smem_D; + }; + +public: + // TMA pipeline for loading C + using LoadPipeline = cutlass::PipelineTransactionAsync; + using LoadPipelineState = cutlass::PipelineState; + constexpr static uint32_t TmaTransactionBytes = StageCBits / 8; + + // TMA pipeline for storing D + using StorePipeline = cute::conditional_t, + cutlass::PipelineTmaStore>; + using StorePipelineState = cutlass::PipelineState; + + struct SharedStorage { + struct TensorStorage { + using CollectiveStorage = cute::conditional_t>; + CollectiveStorage collective; + + using FusionStorage = typename FusionCallbacks::SharedStorage; + FusionStorage thread; + } tensors; + + using PipelineStorage = typename LoadPipeline::SharedStorage; + PipelineStorage pipeline; + }; + using TensorStorage = typename SharedStorage::TensorStorage; + using PipelineStorage = typename SharedStorage::PipelineStorage; + + // Planar complex kernels have two accumulator copies for the real and imaginary tensors. 
+ constexpr static int NumAccumulatorMtxs = 1; + + // Host side epilogue arguments + struct Arguments { + typename FusionCallbacks::Arguments thread{}; + ElementC const* ptr_C = nullptr; + StrideC dC{}; + ElementD* ptr_D = nullptr; + StrideD dD{}; + }; + +private: + static constexpr auto + get_tma_epi_tile() { + return cute::transform_apply(EpilogueTile{}, seq<0,1>{}, + [] (auto epi_tiler, auto mode) { + auto cta_tiler_shape = get(CtaTileShape{}); + // Use a dynamic stride to prevent mode coalescing + auto cta_tiler_stride = repeat_like(cta_tiler_shape, 0); + auto cta_tiler = make_layout(cta_tiler_shape, cta_tiler_stride); + // This is a multimodal CTA tiler, transform before returning + if constexpr (depth(cta_tiler) > 0) { + // This is an implicit multimodal tiler, match profile and return + if constexpr (tuple_size_v == 1) { + return make_tile(epi_tiler); + } + // This is an explicit multimodal tiler, compose out epi tiler + else { + return shape(composition(cta_tiler, epi_tiler)); + } + } + // This is a flat CTA tiler, no need for transformation + else { + return epi_tiler; + } + }, + [] (auto... 
epi_tilers) { + return make_tile(epi_tilers...); + } + ); + } + + using TmaEpilogueTile = decltype(get_tma_epi_tile()); + + template + static constexpr auto + get_tma_load_c(ProblemShapeMNL const& problem_shape_mnl, Arguments const& args) { + Tensor tensor_c = make_tensor(make_gmem_ptr(args.ptr_C), + make_layout(problem_shape_mnl, append<3>(args.dC, _0{}))); + return make_tma_copy(CopyOpG2S{}, tensor_c, SmemLayoutStageC{}, TmaEpilogueTile{}, _1{}); + } + + template + static constexpr auto + get_tma_store_d(ProblemShapeMNL const& problem_shape_mnl, Arguments const& args) { + Tensor tensor_d = make_tensor(make_gmem_ptr(args.ptr_D), + make_layout(problem_shape_mnl, append<3>(args.dD, _0{}))); + return make_tma_copy(CopyOpS2G{}, tensor_d, SmemLayoutStageD{}, TmaEpilogueTile{}, _1{}); + } + +public: + // Device side epilogue params + struct Params { + using TMA_C = decltype(get_tma_load_c (repeat_like(append<3>(StrideC{},_1{}), int32_t(0)), Arguments{})); + using TMA_D = decltype(get_tma_store_d(repeat_like(append<3>(StrideD{},_1{}), int32_t(0)), Arguments{})); + + typename FusionCallbacks::Params thread{}; + TMA_C tma_load_c; + TMA_D tma_store_d; + }; + + // + // Gemm Host Functions + // + + template + static constexpr Params + to_underlying_arguments( + ProblemShape const& problem_shape, + Arguments const& args, + [[maybe_unused]] void* workspace) { + // Optionally append 1s until problem shape is rank-4 in case its is only rank-3 (MNK) + auto problem_shape_mnl = select<0,1,3>(append<4>(problem_shape, 1)); + typename Params::TMA_C tma_load_c{}; + if constexpr (is_source_supported) { + tma_load_c = get_tma_load_c(problem_shape_mnl, args); + } + + typename Params::TMA_D tma_store_d = get_tma_store_d(problem_shape_mnl, args); + + return { + FusionCallbacks::to_underlying_arguments(problem_shape, args.thread, workspace), + tma_load_c, + tma_store_d + }; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + 
return FusionCallbacks::get_workspace_size(problem_shape, args.thread); + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return FusionCallbacks::initialize_workspace(problem_shape, args.thread, workspace, stream, cuda_adapter); + } + + template + static bool + can_implement( + ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + constexpr int tma_alignment_bits_d = cutlass::detail::get_output_alignment_bits(); + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M,N,K,L] = problem_shape_MNKL; + auto shape = cute::make_shape(M,N,L); + + bool implementable = true; + constexpr int min_tma_aligned_elements_D = tma_alignment_bits_d / cutlass::sizeof_bits::value; + if constexpr (cute::is_same_v) { // ignore L stride for implicit gemm + implementable = implementable && cutlass::detail::check_alignment(take<0,2>(shape), take<0,2>(StrideD{})); + } + else { + implementable = implementable && cutlass::detail::check_alignment(shape, StrideD{}); + } + + if constexpr (is_source_supported) { + constexpr int tma_alignment_bits_c = cutlass::detail::get_output_alignment_bits(); + constexpr int min_tma_aligned_elements_C = tma_alignment_bits_c / cutlass::sizeof_bits::value; + if constexpr (cute::is_same_v) { // ignore L stride for implicit gemm + implementable = implementable && cutlass::detail::check_alignment(take<0,2>(shape), take<0,2>(StrideC{})); + } + else { + implementable = implementable && cutlass::detail::check_alignment(shape, StrideC{}); + } + } + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); + } + + bool fusion_implementable = FusionCallbacks::can_implement(problem_shape, args.thread); + + if (!fusion_implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the 
minimum requirements for FusionCallbacks.\n"); + } + + return implementable && fusion_implementable; + } + + // + // Conv Host Functions + // + + template + static constexpr Params + to_underlying_arguments(cutlass::conv::ConvProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return to_underlying_arguments(cutlass::conv::detail::get_transformed_problem_shape_MNKL(problem_shape), args, workspace); + } + + template + static size_t + get_workspace_size(cutlass::conv::ConvProblemShape const& problem_shape, Arguments const& args) { + return get_workspace_size(cutlass::conv::detail::get_transformed_problem_shape_MNKL(problem_shape), args); + } + + template + static cutlass::Status + initialize_workspace(cutlass::conv::ConvProblemShape const& problem_shape, Arguments const& args, + void* workspace, cudaStream_t stream, CudaHostAdapter* cuda_adapter = nullptr) { + return initialize_workspace(cutlass::conv::detail::get_transformed_problem_shape_MNKL(problem_shape), args, workspace, stream, cuda_adapter); + } + + template + static bool + can_implement(cutlass::conv::ConvProblemShape const& problem_shape, Arguments const& args) { + return can_implement(cutlass::conv::detail::get_transformed_problem_shape_MNKL(problem_shape), args); + } + + // + // Static Device Functions + // + + template + CUTLASS_DEVICE + static constexpr int + get_load_pipe_increment(CtaTileMNK const& cta_tile_mnk) { + // Compute number of epilogue subtiles + return size<1>(zipped_divide(make_layout(take<0,2>(cta_tile_mnk)), EpilogueTile{})); + } + + template + CUTLASS_DEVICE + static constexpr int + get_store_pipe_increment(CtaTileMNK const& cta_tile_mnk) { + return get_load_pipe_increment(cta_tile_mnk); + } + + /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance + CUTLASS_DEVICE static void + prefetch_tma_descriptors(Params const& epilogue_params) { + cute::prefetch_tma_descriptor(epilogue_params.tma_load_c.get_tma_descriptor()); + 
cute::prefetch_tma_descriptor(epilogue_params.tma_store_d.get_tma_descriptor()); + } + + // + // Constructor and Data Members + // + CUTLASS_DEVICE + CollectiveEpilogue(Params const& params_, TensorStorage& shared_tensors) + : params(params_), fusion_callbacks(params_.thread, shared_tensors.thread) {} + +private: + Params const& params; + FusionCallbacks fusion_callbacks; + + // + // Non-static Device Functions + // +public: + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return fusion_callbacks.is_producer_load_needed(); + } + + template< + bool ReuseTmem = false, + class ProblemShapeMNKL, + class CtaTileMNK, + class CtaCoordMNKL, + class MmaTileMNK, + class TiledMma + > + CUTLASS_DEVICE auto + load( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_producer_state, + ProblemShapeMNKL problem_shape_mnkl, + CtaTileMNK cta_tile_mnk, + CtaCoordMNKL cta_coord_mnkl, + MmaTileMNK mma_tile_mnk, + TiledMma tiled_mma, + TensorStorage& shared_tensors, + bool reverse_epi_n = false) { + using namespace cute; + + int lane_idx = canonical_lane_idx(); + auto [M, N, K, L] = problem_shape_mnkl; + auto [m_coord, n_coord, k_coord, l_coord] = cta_coord_mnkl; + + // The tma tensor C under im2col mode only has two modes (M, N) which + // should be local tiled with only (m_coord, n_coord). 
+ auto coord_shape = + conditional_return(make_coord(m_coord, n_coord), make_coord(m_coord, n_coord, l_coord)); + + // Represent the full source tensor, slice to get the tile this CTA is currently responsible for + Tensor mC_mn = params.tma_load_c.get_tma_tensor(make_shape(M,N,L)); // (M,N,L) + Tensor mC = coalesce(mC_mn, take<0,2>(cta_tile_mnk)); + Tensor gC = local_tile(mC, take<0,2>(cta_tile_mnk), coord_shape); // (CTA_M,CTA_N) + + // Apply epilogue subtile, get matching smem tensor + auto ptr_sC = shared_tensors.collective.smem_C.begin(); + Tensor gC_epi = flat_divide(gC, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + Tensor sC_epi = make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{}); // (EPI_TILE_M,EPI_TILE_N,PIPE_C) + + // Prepare the thread(b)lock's (G)mem to (S)mem TMA tiled copy (bGS_) + ThrCopy thrblk_g2s = params.tma_load_c.get_slice(Int<0>{}); + Tensor bGS_gC = thrblk_g2s.partition_S(gC_epi); // (TMA,TMA_M,TMA_N,EPI_M,EPI_N) + Tensor bGS_sC = thrblk_g2s.partition_D(sC_epi); // (TMA,TMA_M,TMA_N,PIPE_C) + + // Get the fusion callbacks for the producer load warp + auto pld_args = cutlass::epilogue::fusion::detail::ProducerLoadArgs{ + problem_shape_mnkl, + cta_tile_mnk, + cta_coord_mnkl, + tiled_mma, + EpilogueTile{}, + lane_idx + }; + auto pld_callbacks = fusion_callbacks.get_producer_load_callbacks(pld_args); + bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed(); + + // Predication for TMA load (one thread issues TMA load) + bool issue_tma_load = cute::elect_one_sync(); + + // Pre-loop fusion callback entry point + pld_callbacks.begin(); + + CUTLASS_PRAGMA_UNROLL + for (int iter_n = 0; iter_n < size<3>(gC_epi); ++iter_n) { + CUTLASS_PRAGMA_UNROLL + for (int iter_m = 0; iter_m < size<2>(gC_epi); ++iter_m) { + int epi_m = iter_m, epi_n = iter_n; + if constexpr (ReuseTmem) { + if (reverse_epi_n) { + epi_n = size<3>(gC_epi) - 1 - iter_n; + } + } + // Acquire the lock for this stage + constexpr uint16_t mcast_mask = 0; 
+ uint64_t* tma_barrier = load_pipeline.producer_get_barrier(load_pipe_producer_state); + load_pipeline.producer_acquire(load_pipe_producer_state); + + // Execute the TMA load for C if needed + if (issue_tma_load && is_C_load_needed) { + copy(params.tma_load_c.with(*tma_barrier, mcast_mask), + bGS_gC(_,_,_,epi_m,epi_n), bGS_sC(_,_,_,load_pipe_producer_state.index())); + load_pipeline.producer_expect_transaction(load_pipe_producer_state); + } + + // Loop fusion callback entry point + pld_callbacks.step(tma_barrier, epi_m, epi_n, load_pipe_producer_state.count(), issue_tma_load); + + // Commit TMA loads for this stage and release the lock + load_pipeline.producer_commit(load_pipe_producer_state); + ++load_pipe_producer_state; + } + } + + // Post-loop fusion callback entry point + pld_callbacks.end(); + + return load_pipe_producer_state; + } + + CUTLASS_DEVICE void + load_tail( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_producer_state, + [[maybe_unused]] StorePipeline store_pipeline, + [[maybe_unused]] StorePipelineState store_pipe_producer_state) { + load_pipeline.producer_tail(load_pipe_producer_state); + } + + template< + bool ReuseTmem = false, + class AccumulatorPipeline, + class AccumulatorPipelineState, + class ProblemShapeMNKL, + class CtaTileMNK, + class CtaCoordMNKL, + class MmaTileMNK, + class TiledMma, + class AccEngine, + class AccLayout + > + CUTLASS_DEVICE auto + store( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_consumer_state, + StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state, + AccumulatorPipeline acc_pipeline, + AccumulatorPipelineState acc_pipe_consumer_state, + ProblemShapeMNKL problem_shape_mnkl, + CtaTileMNK cta_tile_mnk, + CtaCoordMNKL cta_coord_mnkl, + MmaTileMNK mma_tile_mnk, + TiledMma tiled_mma, + cute::Tensor accumulators, + TensorStorage& shared_tensors + ) { + using namespace cute; + using ElementAccumulator = typename AccEngine::value_type; + using ElementCompute_ = typename 
epilogue::fusion::FusionCallbacksTraits::ElementCompute; + using ElementCompute = cute::conditional_t,ElementAccumulator,ElementCompute_>; + + static_assert(is_tmem::value, "Accumulator must be TMEM resident."); + static_assert(rank(accumulators) == 3, "Accumulators must be MMA-partitioned: [MMA, MMA_M, MMA_N]"); + static_assert(size<1>(accumulators) == 1 && size<2>(accumulators) == 1, "TiledMMA must match partitioned ShapeMN"); + static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(rank(CtaCoordMNKL{}) == 4, "CoordMNKL must be rank 4"); + + // Indexing variables + auto [M, N, K, L] = problem_shape_mnkl; + auto [m_coord, n_coord, k_coord, l_coord] = cta_coord_mnkl; + int thread_idx = threadIdx.x % ThreadCount; + int warp_idx = thread_idx / NumThreadsPerWarp; + [[maybe_unused]] int lane_idx = thread_idx % NumThreadsPerWarp; + + // The tma tensor D under im2col mode only has two modes (M, N) which + // should be local tiled with only (m_coord, n_coord). 
+ auto coord_shape = + conditional_return(make_coord(m_coord, n_coord), make_coord(m_coord, n_coord, l_coord)); + + // Represent the full output tensor, slice to get the tile this CTA is responsible for + Tensor mD_mn = params.tma_store_d.get_tma_tensor(make_shape(M,N,L)); // (M,N,L) + Tensor mD = coalesce(mD_mn, take<0,2>(cta_tile_mnk)); + Tensor gD = local_tile(mD, take<0,2>(cta_tile_mnk), coord_shape); // (CTA_M,CTA_N) + + Tensor tAcc = accumulators(make_coord(_,_),_0{},_0{}); // (CTA_M,CTA_N) + + // Apply epilogue subtiling + Tensor tAcc_epi = flat_divide(tAcc, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + Tensor gD_epi = flat_divide( gD, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + + // Construct the corresponding pipelined smem tensors + auto ptr_sC = shared_tensors.collective.smem_C.begin(); + auto ptr_sD = shared_tensors.collective.smem_D.begin(); + Tensor sC_epi = cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{})); // (EPI_TILE_M,EPI_TILE_N,PIPE_C) + Tensor sD_epi = cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(ptr_sD), SmemLayoutD{})); // (EPI_TILE_M,EPI_TILE_N,PIPE_D) + + // (t)hread-partition for (t)mem to (r)egister copy (tTR_) + TiledCopy tiled_t2r = make_tmem_copy(CopyOpT2R{}, tAcc_epi(_,_,_0{},_0{})); + ThrCopy thread_t2r = tiled_t2r.get_slice(thread_idx); + Tensor tTR_tAcc = thread_t2r.partition_S(tAcc_epi); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) + Tensor tTR_sD = thread_t2r.partition_D(sD_epi(_,_,_0{})); // (T2R,T2R_M,T2R_N) + + // Allocate D and accumulator registers + // Does directly store the visitor into smem. 
+ constexpr bool IsDirectR2S = cute::is_same_v>; + using RegisterElementD = cute::conditional_t; + Tensor tTR_rAcc = make_tensor(shape(tTR_sD)); // (T2R,T2R_M,T2R_N) + Tensor tTR_rD = make_tensor(shape(tTR_sD)); // (T2R,T2R_M,T2R_N) + + // Vectorized fragment view + constexpr int FragmentSize = DispatchPolicy::FragmentSize; + Tensor tTR_rAcc_frg = recast>(coalesce(tTR_rAcc)); // (EPI_V) + Tensor tTR_rD_frg = recast>(coalesce(tTR_rD)); // (EPI_V) + CUTE_STATIC_ASSERT(size(tTR_rAcc) % DispatchPolicy::FragmentSize == 0, "Fragment size does not vectorize properly"); + + // (t)hread-partition for (s)mem to (r)egister copy (tSR_) + TiledCopy tiled_s2r = make_tiled_copy_D(Copy_Atom{}, tiled_t2r); + ThrCopy thread_s2r = tiled_s2r.get_slice(thread_idx); + Tensor tSR_sC = thread_s2r.partition_S(sC_epi); // (S2R,S2R_M,S2R_N,PIPE_C) + Layout tSR_rC_layout = thread_s2r.retile_D(tTR_rD).layout(); // (S2R,S2R_M,S2R_N) + + // Allocate C registers + // If C smem load is a non-vectorized dst(i) = src(i) then we can allocate C registers directly in the compute type + // to eliminate some redundant pack+unpack instruction sequences for sub-word types + constexpr bool IsDirectS2R = cute::is_same_v> + && decltype(max_common_vector(tSR_rC_layout, tSR_sC.layout()))::value <= 1; + using RegisterElementC = cute::conditional_t; + Tensor tTR_rC = make_tensor(shape(tTR_sD)); // (T2R,T2R_M,T2R_N) + Tensor tSR_rC = thread_s2r.retile_D(tTR_rC); // (S2R,S2R_M,S2R_N) + + // (t)hread-partition for (r)egister to (r)egister copy (tRR_) + TiledCopy tiled_r2r = make_tiled_copy_D(Copy_Atom{}, tiled_t2r); + ThrCopy thread_r2r = tiled_r2r.get_slice(thread_idx); + Tensor tRR_rD_src = thread_r2r.retile_S(tTR_rD); // (R2R,R2R_M,R2R_N,EPI_M,EPI_N) + Tensor tRR_rD_dst = thread_r2r.retile_D(tTR_rD); // (R2R,R2R_M,R2R_N,EPI_M,EPI_N) + + // (t)hread-partition for (r)egister to (s)mem copy (tRS_) + TiledCopy tiled_r2s = make_tiled_copy_D(Copy_Atom{}, tiled_r2r); + ThrCopy thread_r2s = 
tiled_r2s.get_slice(thread_idx); + Tensor tRS_sD = thread_r2s.partition_D(sD_epi); // (R2S,R2S_M,R2S_N,PIPE_D) + Tensor tRS_rD = [&]() CUTLASS_LAMBDA_FUNC_INLINE { + if constexpr (!IsDirectR2S) { + return make_tensor(shape(tRS_sD(_,_,_,_0{}))); + } + else{ + return thread_r2s.retile_S(tTR_rD); // (R2S,R2S_M,R2S_N) + } + }(); + + Tensor tRR_rD_dst_frg = recast>(coalesce(tRR_rD_dst)); + Tensor tRS_rD_frg = recast>(coalesce(tRS_rD)); + + // thread(b)lock-partition for (s)mem to (g)mem copy (bSG_) + ThrCopy thrblk_s2g = params.tma_store_d.get_slice(Int<0>{}); + Tensor bSG_sD = thrblk_s2g.partition_S(sD_epi); // (S2G,S2G_M,S2G_N,PIPE_D) + Tensor bSG_gD = thrblk_s2g.partition_D(gD_epi); // (S2G,S2G_M,S2G_N,EPI_M,EPI_N) + + // OOB predication for tile quantization "residue" + // Absolute coordinate tensors (dynamic) + Tensor mD_crd = make_identity_tensor(make_shape(M,N)); // (M,N) + Tensor cD_mn = local_tile(mD_crd, take<0,2>(cta_tile_mnk), make_coord(m_coord, n_coord)); // (CTA_M,CTA_N) + Tensor tTR_cD_mn = thread_t2r.partition_D(flat_divide(cD_mn, EpilogueTile{})); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) + // Relative coordinate tensors (static) + Tensor cD = make_coord_tensor(cD_mn.layout()); // (CTA_M,CTA_N) + Tensor tTR_cD = make_coord_tensor(tTR_cD_mn.layout()); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) + // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate + auto residue_cD = make_coord(M,N) - cD_mn(_0{}); // (m,n) + auto residue_tTR_cD = make_coord(M,N) - tTR_cD_mn(_0{}); // (m,n) + + // Arguments for the fusion callbacks for the consumer store warps + constexpr bool RefSrc = false; // Register tensors reference T2R copy dst layout + auto cst_args = cutlass::epilogue::fusion::detail::ConsumerStoreArgs{ + problem_shape_mnkl, + cta_tile_mnk, + cta_coord_mnkl, + tiled_mma, + EpilogueTile{}, + tiled_t2r, + cD, + residue_cD, + tTR_cD, + residue_tTR_cD, + tTR_rC, + thread_idx + }; + + // Thread synchronizer for previously 
issued waits or fences + // to ensure visibility of smem reads/writes to threads or TMA unit + auto synchronize = [] () { cutlass::arch::NamedBarrier::sync(ThreadCount, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; + + // Predication for sub-128 thread T2R tiled copy + Layout tmem_warp_layout = typename decltype(make_tmem_warp_partitioner(tAcc_epi(_,_,0,0)))::TiledLayout_TV{}; + constexpr bool predicate_tmem_load = size(tmem_warp_layout) != cosize(tmem_warp_layout); + bool issue_tmem_load = true; + + // If tmem doesn't have enough capacity to support double buffering, a portion of tmem (a column of epilogue tiles) + // is overlapped between 2 pseudo-buffers. The shared tmem portion corresponds to the last epilogue tile column of + // tmem accumulator buffer 0, and the first epilogue tile column of tmem accumulator 1. + // Thus, whenever we are processing tmem accumulator buffer 0, we process the epilogue tiles with reversed column order. + // Once the last epilogue tile column is loaded from tmem, the acc_pipeline is released. + // Then, the next accumulation stage for buffer 1 can start. + [[maybe_unused]] bool reverse_epi_n = ReuseTmem && acc_pipe_consumer_state.phase() == 0; + static_assert(not (ReuseTmem && AccumulatorPipeline::Stages != 1), "Tmem reuse requires 1 accumulator stage"); + + // Predication for TMA store (one warp issues TMA store) + bool issue_tma_store = warp_idx == 0; + + // In the reuse smem configuration we have StagesC smem buffers and at most StagesD committed TMA stores in flight. + // The TMA store pipeline producer acquire returns when at most StagesD-1 committed stores are in-flight, so we can + // only guarantee store completion after StagesD iterations, then we can begin issuing releases on the smem buffer locks. + // store_pipe_producer_state tracks the acquire and load_pipe_consumer_state tracks the release, in circular buffer fashion. 
+ // If TMA store supported async transaction mbarriers we would not need this synchronous release behavior. + LoadPipelineState load_wait_state = load_pipe_consumer_state; + if constexpr (ReuseSmemC) { + load_wait_state = store_pipe_producer_state; + load_wait_state.phase_ ^= 1; + } + + // We can delay issue of TMA store by one iteration to achieve better interleaving of non-TMA instructions + // Sync requirements of smem reuse may preclude this optimization + // Delayed stores cause delayed stage releases which causes deadlock when StagesC == StagesD + [[maybe_unused]] int epi_m_prev = 0; + [[maybe_unused]] int epi_n_prev = 0; + static_assert(not (DelayTmaStore and ReuseSmemC and StagesC <= StagesD), "This TMA epilogue configuration will deadlock"); + + // The Epilogue Loop + auto epi_loop_fn = [&] (auto& cst_callbacks) CUTLASS_LAMBDA_FUNC_INLINE { + bool is_producer_load_needed = fusion_callbacks.is_producer_load_needed(); + bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed(); + + // The TMA store sequence for one epilogue loop iteration + auto tma_store_fn = [&] (int epi_m, int epi_n) CUTLASS_LAMBDA_FUNC_INLINE { + // Write the tile from smem to gmem with TMA + cutlass::arch::fence_view_async_shared(); // ensure smem writes are visible to TMA + synchronize(); // ensure all threads have issued their async fence + if (issue_tma_store) { + copy(params.tma_store_d, bSG_sD(_,_,_,store_pipe_producer_state.index()), bSG_gD(_,_,_,epi_m,epi_n)); + } + + // Post async fence, pre TMA commit callback entry point + cst_callbacks.tma_store(epi_m, epi_n, store_pipe_producer_state.count(), issue_tma_store); + + // Commit the TMA stores for this stage + if (issue_tma_store) { + store_pipeline.producer_commit(store_pipe_producer_state); + } + ++store_pipe_producer_state; + + // Wait for the next smem buffer to be available + if (issue_tma_store) { + store_pipeline.producer_acquire(store_pipe_producer_state); + } + synchronize(); + + if constexpr 
(ReuseSmemC) { + // producer_acquire returns when at most StagesD-1 committed stores are pending + bool store_finished = store_pipe_producer_state.count() > StorePipeline::UnacquiredStages; + // Let dma warp know earliest smem buffer is consumed and empty after StagesD producer commits + if (store_finished) { + if (is_producer_load_needed) { + load_pipeline.consumer_release(load_pipe_consumer_state); + } + ++load_pipe_consumer_state; + } + } + }; // tma_store_fn + + cst_callbacks.begin(); + if (cst_callbacks.begin_sync_needed()) { + synchronize(); + } + + // Begin the wait for the producer load results + ConsumerToken load_wait_token{BarrierStatus::WaitDone}; + if (is_producer_load_needed) { + load_wait_token = load_pipeline.consumer_try_wait(load_wait_state); + } + // Begin the wait for the accumulator results + ConsumerToken acc_wait_token = acc_pipeline.consumer_try_wait(acc_pipe_consumer_state); + + // For each epilogue subtile within the CTA tile + constexpr int NumEpiSubtilesN = CUTE_STATIC_V(size<3>(gD_epi)); + constexpr int NumEpiSubtilesM = CUTE_STATIC_V(size<2>(gD_epi)); + #pragma unroll(UnrollEpiLoop ? NumEpiSubtilesN : 1) + for (int iter_n = 0; iter_n < NumEpiSubtilesN; ++iter_n) { + #pragma unroll(UnrollEpiLoop ? 
NumEpiSubtilesM : 1) + for (int iter_m = 0; iter_m < NumEpiSubtilesM; ++iter_m) { + int epi_m = iter_m, epi_n = iter_n; + bool is_first_iteration = iter_m == 0 && iter_n == 0; + bool is_last_iteration = iter_m == size<2>(gD_epi)-1 && iter_n == size<3>(gD_epi)-1; + bool do_acc_release = is_last_iteration; + + // Reverse subtile order for tmem reuse if necessary + if constexpr (ReuseTmem) { + if (reverse_epi_n) { + epi_n = size<3>(gD_epi) - 1 - iter_n; + } + do_acc_release = iter_m == size<2>(gD_epi)-1 && iter_n == 0; + } + + cst_callbacks.begin_loop(epi_m, epi_n); + + if (is_producer_load_needed) { + // Wait for the producer load to fill smem + load_pipeline.consumer_wait(load_wait_state, load_wait_token); + + if (is_C_load_needed) { + // Copy source tile from smem to register + copy(tiled_s2r, tSR_sC(_,_,_,load_wait_state.index()), tSR_rC); + // Ensure smem loads are complete before reusing smem for mixed types/layouts + if constexpr (ReuseSmemC && not (SmemLayoutC{} == SmemLayoutD{})) { + synchronize(); + } + } + } + + // First loop fusion callback entry point + cst_callbacks.previsit(epi_m, epi_n, load_wait_state.count(), is_producer_load_needed); + + if (is_producer_load_needed) { + // Let producer load warp know smem buffers are consumed and empty + if constexpr (not ReuseSmemC) { + cutlass::arch::fence_view_async_shared(); + load_pipeline.consumer_release(load_pipe_consumer_state); + ++load_pipe_consumer_state; + } + ++load_wait_state; + } + + if (is_first_iteration) { + // Wait for mma warp to fill tmem buffer with accumulator results + acc_pipeline.consumer_wait(acc_pipe_consumer_state, acc_wait_token); + } + + // The current tile in tmem + Tensor tTR_tAcc_mn = tTR_tAcc(_,_,_,epi_m,epi_n); + + // Compute tmem load predication if necessary + if constexpr (predicate_tmem_load) { + // Issue tmem load if this tile's tmem subpartition is accessible by this warp + int subpart_idx = (tTR_tAcc_mn.data().dp_ / 32) % 4; + issue_tmem_load = warp_idx == subpart_idx; + } 
+ bool issue_smem_store = issue_tmem_load; + + // Copy accumulator tile from tmem to register + if (issue_tmem_load) { + copy(tiled_t2r, tTR_tAcc_mn, tTR_rAcc); + } + + // After the last tmem load, signal that tmem buffer is consumed and empty + if (do_acc_release) { + cutlass::arch::fence_view_async_tmem_load(); + acc_pipeline.consumer_release(acc_pipe_consumer_state); + ++acc_pipe_consumer_state; + } + + // Vectorized fragment loop with visitor callback entry point + CUTLASS_PRAGMA_UNROLL + for (int epi_v = 0; epi_v < size(tTR_rD_frg); ++epi_v) { + tTR_rD_frg(epi_v) = cst_callbacks.visit(tTR_rAcc_frg(epi_v), epi_v, epi_m, epi_n); + } + + // The latest we can delay the TMA store is right before the smem store of the next iteration + // since the current TMA store needs to be committed before we can acquire the next smem buffer + if constexpr (DelayTmaStore) { + // Issue TMA stores for the previous subtile + if (not is_first_iteration) { + tma_store_fn(epi_m_prev, epi_n_prev); + } + epi_m_prev = epi_m; + epi_n_prev = epi_n; + } + + if constexpr (!IsDirectR2S) { + // At present, only FP4 col output with scalefactor generation fusion would go into these branch + copy(tiled_r2r, tRR_rD_src, tRR_rD_dst); + } + tRS_rD_frg(_0{}) = cutlass::NumericArrayConverter{}(tRR_rD_dst_frg(_0{})); + + // Smem reduction callback entry point using current store buffer for workspace + Tensor reduction_buffer = make_tensor(raw_pointer_cast(sD_epi(_,_,store_pipe_producer_state.index()).data()), + make_layout(stride<2>(get_nonswizzle_portion(SmemLayoutD{})), _1{})); + cst_callbacks.reduce(reduction_buffer, synchronize, epi_m, epi_n, is_last_iteration, tRS_rD_frg); + + // Copy output tile from register to smem + if (issue_smem_store) { + copy(tiled_r2s, tRS_rD, tRS_sD(_,_,_,store_pipe_producer_state.index())); + } + + // Post reduction, pre TMA store callback entry point + cst_callbacks.postreduce(epi_m, epi_n, store_pipe_producer_state.count(), issue_smem_store); + + if constexpr (not 
DelayTmaStore) { + // Issue TMA stores for this subtile + tma_store_fn(epi_m, epi_n); + } + + cst_callbacks.end_loop(epi_m, epi_n); + + if (is_producer_load_needed) { + // Begin the wait for the next subtile producer load + load_wait_token = load_pipeline.consumer_try_wait(load_wait_state, is_last_iteration); + } + } // for epi_m + } // for epi_n + + if constexpr (DelayTmaStore) { + // Issue TMA stores for the last subtile + tma_store_fn(epi_m_prev, epi_n_prev); + } + + cst_callbacks.end(); + }; // epi_loop_fn + + // + // BEGIN EPILOGUE + // + auto cst_callbacks = fusion_callbacks.template get_consumer_store_callbacks(cst_args); + epi_loop_fn(cst_callbacks); + return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state, acc_pipe_consumer_state); + } + + // API with Global Accumulator in registers for FastFP32 (emulated MMA) kernels. + // The accumulator in TMEM periodically loaded into the registers so that the MMA can clear out the TMEM accumulator + // values for better accuracy. This epilogue accepts the accumulator in registers and take TiledCopy for the + // TMEM->Reg as a parameter to be used in partitioning GMEM tensors C and D. 
+ template< + class ProblemShapeMNKL, + class CtaTileMNK, + class CtaCoordMNKL, + class MmaTileMNK, + class TiledMma, + class AccEngine, + class AccLayout, + class TiledCopyT2R + > + CUTLASS_DEVICE auto + store( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_consumer_state, + StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state, + ProblemShapeMNKL problem_shape_mnkl, + CtaTileMNK cta_tile_mnk, + CtaCoordMNKL cta_coord_mnkl, + MmaTileMNK mma_tile_mnk, + TiledMma tiled_mma, + cute::Tensor& tTR_rAcc, // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) + TensorStorage& shared_tensors, + TiledCopyT2R tiled_t2r + ) { + using namespace cute; + using ElementAccumulator = typename AccEngine::value_type; + using ElementCompute_ = typename epilogue::fusion::FusionCallbacksTraits::ElementCompute; + using ElementCompute = cute::conditional_t,ElementAccumulator,ElementCompute_>; + + static_assert(is_rmem::value, "Accumulator must be Register resident."); + static_assert(rank(AccLayout{}) == 5, "Accumulators must be copy-partitioned: (T2R,T2R_M,T2R_N,EPI_M,EPI_N)"); + static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(rank(CtaCoordMNKL{}) == 4, "CoordMNKL must be rank 4"); + + // Indexing variables + auto [M, N, K, L] = problem_shape_mnkl; + auto [m_coord, n_coord, k_coord, l_coord] = cta_coord_mnkl; + int thread_idx = threadIdx.x % ThreadCount; + int warp_idx = thread_idx / NumThreadsPerWarp; + [[maybe_unused]] int lane_idx = thread_idx % NumThreadsPerWarp; + + // The tma tensor D under im2col mode only has two modes (M, N) which + // should be local tiled with only (m_coord, n_coord). 
+ auto coord_shape = + conditional_return(make_coord(m_coord, n_coord), make_coord(m_coord, n_coord, l_coord)); + + // Represent the full output tensor, slice to get the tile this CTA is responsible for + Tensor mD_mn = params.tma_store_d.get_tma_tensor(make_shape(M,N,L)); // (M,N,L) + Tensor mD = coalesce(mD_mn, take<0,2>(cta_tile_mnk)); + Tensor gD = local_tile(mD, take<0,2>(cta_tile_mnk), coord_shape); // (CTA_M,CTA_N) + + // Apply epilogue subtiling + Tensor gD_epi = flat_divide( gD, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + + // Construct the corresponding pipelined smem tensors + auto ptr_sC = shared_tensors.collective.smem_C.begin(); + auto ptr_sD = shared_tensors.collective.smem_D.begin(); + Tensor sC_epi = cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{})); // (EPI_TILE_M,EPI_TILE_N,PIPE_C) + Tensor sD_epi = cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(ptr_sD), SmemLayoutD{})); // (EPI_TILE_M,EPI_TILE_N,PIPE_D) + + // (t)hread-partition for (t)mem to (r)egister copy (tTR_) + ThrCopy thread_t2r = tiled_t2r.get_slice(thread_idx); + Tensor tTR_sD = thread_t2r.partition_D(sD_epi(_,_,_0{})); // (T2R,T2R_M,T2R_N) + + // Allocate D and accumulator registers + Tensor tTR_rD = make_tensor(shape(tTR_sD)); // (T2R,T2R_M,T2R_N) + + // Vectorized fragment view + constexpr int FragmentSize = DispatchPolicy::FragmentSize; + Tensor tTR_rD_frg = recast>(coalesce(tTR_rD)); // (EPI_V) + + // (t)hread-partition for (s)mem to (r)egister copy (tSR_) + TiledCopy tiled_s2r = make_tiled_copy_D(Copy_Atom{}, tiled_t2r); + ThrCopy thread_s2r = tiled_s2r.get_slice(thread_idx); + Tensor tSR_sC = thread_s2r.partition_S(sC_epi); // (S2R,S2R_M,S2R_N,PIPE_C) + Layout tSR_rC_layout = thread_s2r.retile_D(tTR_rD).layout(); // (S2R,S2R_M,S2R_N) + + // Allocate C registers + // If C smem load is a non-vectorized dst(i) = src(i) then we can allocate C registers directly in the compute type + // to 
eliminate some redundant pack+unpack instruction sequences for sub-word types + constexpr bool IsDirectS2R = cute::is_same_v> + && decltype(max_common_vector(tSR_rC_layout, tSR_sC.layout()))::value <= 1; + using RegisterElementC = cute::conditional_t; + Tensor tTR_rC = make_tensor(shape(tTR_sD)); // (T2R,T2R_M,T2R_N) + Tensor tSR_rC = thread_s2r.retile_D(tTR_rC); // (S2R,S2R_M,S2R_N) + + // (t)hread-partition for (r)egister to (s)mem copy (tRS_) + TiledCopy tiled_r2s = make_tiled_copy_D(Copy_Atom{}, tiled_t2r); + ThrCopy thread_r2s = tiled_r2s.get_slice(thread_idx); + Tensor tRS_rD = thread_r2s.retile_S(tTR_rD); // (R2S,R2S_M,R2S_N) + Tensor tRS_sD = thread_r2s.partition_D(sD_epi); // (R2S,R2S_M,R2S_N,PIPE_D) + + // thread(b)lock-partition for (s)mem to (g)mem copy (bSG_) + ThrCopy thrblk_s2g = params.tma_store_d.get_slice(Int<0>{}); + Tensor bSG_sD = thrblk_s2g.partition_S(sD_epi); // (S2G,S2G_M,S2G_N,PIPE_D) + Tensor bSG_gD = thrblk_s2g.partition_D(gD_epi); // (S2G,S2G_M,S2G_N,EPI_M,EPI_N) + + // OOB predication for tile quantization "residue" + // Absolute coordinate tensors (dynamic) + Tensor mD_crd = make_identity_tensor(make_shape(M,N)); // (M,N) + Tensor cD_mn = local_tile(mD_crd, take<0,2>(cta_tile_mnk), make_coord(m_coord, n_coord)); // (CTA_M,CTA_N) + Tensor tTR_cD_mn = thread_t2r.partition_D(flat_divide(cD_mn, EpilogueTile{})); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) + // Relative coordinate tensors (static) + Tensor cD = make_coord_tensor(cD_mn.layout()); // (CTA_M,CTA_N) + Tensor tTR_cD = make_coord_tensor(tTR_cD_mn.layout()); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) + // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate + auto residue_cD = make_coord(M,N) - cD_mn(_0{}); // (m,n) + auto residue_tTR_cD = make_coord(M,N) - tTR_cD_mn(_0{}); // (m,n) + + // Get the fusion callbacks for the consumer store warps + constexpr bool RefSrc = false; // Register tensors reference T2R copy dst layout + auto cst_args = 
cutlass::epilogue::fusion::detail::ConsumerStoreArgs{ + problem_shape_mnkl, + cta_tile_mnk, + cta_coord_mnkl, + tiled_mma, + EpilogueTile{}, + tiled_t2r, + cD, + residue_cD, + tTR_cD, + residue_tTR_cD, + tTR_rC, + thread_idx + }; + + auto cst_callbacks = fusion_callbacks.template get_consumer_store_callbacks(cst_args); + bool is_producer_load_needed = fusion_callbacks.is_producer_load_needed(); + bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed(); + + // Thread synchronizer for previously issued waits or fences + // to ensure visibility of smem reads/writes to threads or TMA unit + auto synchronize = [] () { cutlass::arch::NamedBarrier::sync(ThreadCount, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; + + // Predication for TMA store (one warp issues TMA store) + bool issue_tma_store = warp_idx == 0; + + // In the reuse smem configuration we have StagesC smem buffers and at most StagesD committed TMA stores in flight. + // The TMA store pipeline producer acquire returns when at most StagesD-1 committed stores are in-flight, so we can + // only guarantee store completion after StagesD iterations, then we can begin issuing releases on the smem buffer locks. + // store_pipe_producer_state tracks the acquire and load_pipe_consumer_state tracks the release, in circular buffer fashion. + // If TMA store supported async transaction mbarriers we would not need this synchronous release behavior. 
+ LoadPipelineState load_wait_state = load_pipe_consumer_state; + if constexpr (ReuseSmemC) { + load_wait_state = store_pipe_producer_state; + load_wait_state.phase_ ^= 1; + } + + // We can delay issue of TMA store by one iteration to achieve better interleaving of non-TMA instructions + // Sync requirements of smem reuse may preclude this optimization + // Delayed stores cause delayed stage releases which causes deadlock when StagesC == StagesD + int epi_m_prev = 0, epi_n_prev = 0; + static_assert(not (DelayTmaStore and ReuseSmemC and StagesC <= StagesD), "This TMA epilogue configuration will deadlock"); + + // The TMA store sequence for one subtile iteration + auto tma_store_fn = [&] (int epi_m, int epi_n) CUTLASS_LAMBDA_FUNC_INLINE { + // Write the tile from smem to gmem with TMA + cutlass::arch::fence_view_async_shared(); // ensure smem writes are visible to TMA + synchronize(); // ensure all threads have issued their async fence + if (issue_tma_store) { + copy(params.tma_store_d, bSG_sD(_,_,_,store_pipe_producer_state.index()), bSG_gD(_,_,_,epi_m,epi_n)); + } + + // Post async fence, pre TMA commit callback entry point + cst_callbacks.tma_store(epi_m, epi_n, store_pipe_producer_state.count(), issue_tma_store); + + // Commit the TMA stores for this stage + if (issue_tma_store) { + store_pipeline.producer_commit(store_pipe_producer_state); + } + ++store_pipe_producer_state; + + // Wait for the next smem buffer to be available + if (issue_tma_store) { + store_pipeline.producer_acquire(store_pipe_producer_state); + } + synchronize(); + + if constexpr (ReuseSmemC) { + // producer_acquire returns when at most StagesD-1 committed stores are pending + bool store_finished = store_pipe_producer_state.count() > StorePipeline::UnacquiredStages; + // Let dma warp know earliest smem buffer is consumed and empty after StagesD producer commits + if (store_finished) { + if (is_producer_load_needed) { + load_pipeline.consumer_release(load_pipe_consumer_state); + } + 
++load_pipe_consumer_state; + } + } + }; + + // + // BEGIN EPILOGUE + // + + cst_callbacks.begin(); + if (cst_callbacks.begin_sync_needed()) { + synchronize(); + } + + // Begin the wait for the producer load results + ConsumerToken load_wait_token{BarrierStatus::WaitDone}; + if (is_producer_load_needed) { + load_wait_token = load_pipeline.consumer_try_wait(load_wait_state); + } + + // For each epilogue subtile within the CTA tile + constexpr int NumEpiSubtilesN = CUTE_STATIC_V(size<3>(gD_epi)); + constexpr int NumEpiSubtilesM = CUTE_STATIC_V(size<2>(gD_epi)); + #pragma unroll(UnrollEpiLoop ? NumEpiSubtilesN : 1) + for (int iter_n = 0; iter_n < NumEpiSubtilesN; ++iter_n) { + #pragma unroll(UnrollEpiLoop ? NumEpiSubtilesM : 1) + for (int iter_m = 0; iter_m < NumEpiSubtilesM; ++iter_m) { + int epi_m = iter_m, epi_n = iter_n; + bool is_first_iteration = iter_m == 0 && iter_n == 0; + bool is_last_iteration = iter_m == size<2>(gD_epi)-1 && iter_n == size<3>(gD_epi)-1; + + cst_callbacks.begin_loop(epi_m, epi_n); + + if (is_producer_load_needed) { + // Wait for the producer load to fill smem + load_pipeline.consumer_wait(load_wait_state, load_wait_token); + + if (is_C_load_needed) { + // Copy source tile from smem to register + copy(tiled_s2r, tSR_sC(_,_,_,load_wait_state.index()), tSR_rC); + // Ensure smem loads are complete before reusing smem for mixed types/layouts + if constexpr (ReuseSmemC && not (SmemLayoutC{} == SmemLayoutD{})) { + synchronize(); + } + } + } + + // First loop fusion callback entry point + cst_callbacks.previsit(epi_m, epi_n, load_wait_state.count(), is_producer_load_needed); + + if (is_producer_load_needed) { + // Let producer load warp know smem buffers are consumed and empty + if constexpr (not ReuseSmemC) { + cutlass::arch::fence_view_async_shared(); + load_pipeline.consumer_release(load_pipe_consumer_state); + ++load_pipe_consumer_state; + } + ++load_wait_state; + } + + Tensor tTR_rAcc_epi_tile = tTR_rAcc(_,_,_,epi_m,epi_n); + Tensor 
tTR_rAcc_frg = recast>(coalesce(tTR_rAcc_epi_tile)); // (EPI_V) + + // Vectorized fragment loop with visitor callback entry point + CUTLASS_PRAGMA_UNROLL + for (int epi_v = 0; epi_v < size(tTR_rD_frg); ++epi_v) { + tTR_rD_frg(epi_v) = cst_callbacks.visit(tTR_rAcc_frg(epi_v), epi_v, epi_m, epi_n); + } + + // The latest we can delay the TMA store is right before the smem store of the next iteration + // since the current TMA store needs to be committed before we can acquire the next smem buffer + if constexpr (DelayTmaStore) { + // Issue TMA stores for the previous subtile + if (not is_first_iteration) { + tma_store_fn(epi_m_prev, epi_n_prev); + } + epi_m_prev = epi_m; + epi_n_prev = epi_n; + } + + // Smem reduction callback entry point using current store buffer for workspace + Tensor reduction_buffer = make_tensor(raw_pointer_cast(sD_epi(_,_,store_pipe_producer_state.index()).data()), + make_layout(stride<2>(get_nonswizzle_portion(SmemLayoutD{})), _1{})); + cst_callbacks.reduce(reduction_buffer, synchronize, epi_m, epi_n, is_last_iteration, tTR_rD_frg); + + // Copy output tile from register to smem + bool issue_smem_store = true; + if (issue_smem_store) { + copy(tiled_r2s, tRS_rD, tRS_sD(_,_,_,store_pipe_producer_state.index())); + } + + // Post reduction, pre TMA store callback entry point + cst_callbacks.postreduce(epi_m, epi_n, store_pipe_producer_state.count(), issue_smem_store); + + if constexpr (not DelayTmaStore) { + // Issue TMA stores for this subtile + tma_store_fn(epi_m, epi_n); + } + + cst_callbacks.end_loop(epi_m, epi_n); + + if (is_producer_load_needed) { + // Begin the wait for the next subtile producer load + load_wait_token = load_pipeline.consumer_try_wait(load_wait_state, is_last_iteration); + } + } // for epi_m + } // for epi_n + + if constexpr (DelayTmaStore) { + // Issue TMA stores for the last subtile + tma_store_fn(epi_m_prev, epi_n_prev); + } + + cst_callbacks.end(); + + return cute::make_tuple(load_pipe_consumer_state, 
store_pipe_producer_state); + } + + template + CUTLASS_DEVICE void + store_tail( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_consumer_state, + StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state, + CtaTileMNK cta_tile_mnk) { + if constexpr (ReuseSmemC) { + if (fusion_callbacks.is_producer_load_needed()) { + // wait for all TMA stores to complete + store_pipeline.producer_tail(store_pipe_producer_state); + + // Issue releases on up to StagesD-1 previously issued TMA stores + constexpr int release_stages = cute::min(StorePipeline::UnacquiredStages, get_load_pipe_increment(cta_tile_mnk)); + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < release_stages; ++stage) { + load_pipeline.consumer_release(load_pipe_consumer_state); + ++load_pipe_consumer_state; + } + } + } + } +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c2b8d84dc92fb8b1a823135b2fdc556bce9dbebc --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp @@ -0,0 +1,549 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing elementwise operations used by epilogues. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + class StrideC, + class StrideD, + class ThreadEpilogueOp, + class SmemLayout, + class CopyAtomR2S, + class TiledCopyS2R, + class CopyAtomR2G, + class EpilogueScheduleType = EpilogueSimtVectorized, + class Enable = void +> +class Epilogue { + static_assert(cute::is_same_v || + cute::is_same_v, + "Could not find an epilogue specialization."); +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Epilogue Vectorized +/// Applies an element wise operation to all elements within the fragment +/// and writes it out to destination storage. +/// +/// Ways to generalize this: +/// - CTA tile shape +/// - vectorization requirements (GMEM) +/// - vectoriz(able) transform() +/// +template < + class StrideC_, + class StrideD_, + class ThreadEpilogueOp_, + class SmemLayout_, + class CopyAtomR2S_, + class TiledCopyS2R_, + class CopyAtomR2G_, + class EpilogueScheduleType_ +> +class Epilogue< + StrideC_, + StrideD_, + ThreadEpilogueOp_, + SmemLayout_, + CopyAtomR2S_, + TiledCopyS2R_, + CopyAtomR2G_, + EpilogueScheduleType_, + cute::enable_if_t< + cute::is_same_v + > + > { +public: + // + // Type Aliases + // + // derived types of output thread level operator + using ThreadEpilogueOp = ThreadEpilogueOp_; + using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator; + using ElementCompute = typename ThreadEpilogueOp::ElementCompute; + using ElementScalar = ElementCompute; + using ElementOutput = typename ThreadEpilogueOp::ElementOutput; + using ElementC = typename ThreadEpilogueOp::ElementC; + using StrideC = StrideC_; + using ElementD = typename 
ThreadEpilogueOp::ElementD; + using StrideD = StrideD_; + using ElementBias = typename detail::IsThreadEpilogueOpWithBias::type; + using SmemLayout = SmemLayout_; + using CopyAtomR2S = CopyAtomR2S_; + using TiledCopyS2R = TiledCopyS2R_; + using CopyAtomR2G = CopyAtomR2G_; + + using GmemTiledCopyC = void; + using GmemTiledCopyD = CopyAtomR2G; + + static constexpr bool IsEpilogueBiasSupported = detail::IsThreadEpilogueOpWithBias::value; + using StrideBias = cute::conditional_t(), Stride<_1,_0,int64_t>, Stride<_0,_1,int64_t>>; + + static_assert(cute::rank(StrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + static_assert(cute::rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + + struct SharedStorage + { + cute::array_aligned> smem_epilogue; + }; + + static constexpr bool IsActHasArgs = detail::IsThreadEpilogueOpWithElementwiseArguments::value; + + // Host side epilogue arguments + template + struct ThreadEpilogueOpArguments { + ElementScalar alpha{0}; + ElementScalar beta{0}; + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias{}; + }; + + template + struct ThreadEpilogueOpArguments< + ThreadEpiOp, + cute::enable_if_t::value>> { + ElementScalar alpha{0}; + ElementScalar beta{0}; + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias{}; + typename ThreadEpiOp::ElementwiseArguments activation{}; + }; + + struct Arguments { + ThreadEpilogueOpArguments thread{}; + using StrideBias = decltype(thread.dBias); + ElementC const* ptr_C = nullptr; + StrideC dC{}; + ElementD* ptr_D = nullptr; + StrideD dD{}; + }; + + // Device side epilogue params + template + struct ParamsType { + typename ThreadEpiOp::Params thread{}; + ElementC const* ptr_C = nullptr; + StrideC dC{}; + ElementD* ptr_D = nullptr; + StrideD dD{}; + ElementBias const* ptr_Bias = nullptr; + StrideBias dBias{}; + }; 
+ + template + struct ParamsType< + ThreadEpiOp, + cute::enable_if_t::value>> { + typename ThreadEpiOp::Params thread{}; + typename ThreadEpiOp::ElementwiseArguments activation{}; + ElementC const* ptr_C = nullptr; + StrideC dC{}; + ElementD* ptr_D = nullptr; + StrideD dD{}; + ElementBias const* ptr_Bias = nullptr; + StrideBias dBias{}; + }; + + using Params = ParamsType; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments( + [[maybe_unused]] ProblemShape const& _, + Arguments const& args, + [[maybe_unused]] void* workspace) { + typename ThreadEpilogueOp::Params thread_op_args; + thread_op_args.alpha = args.thread.alpha; + thread_op_args.beta = args.thread.beta; + thread_op_args.alpha_ptr = args.thread.alpha_ptr; + thread_op_args.beta_ptr = args.thread.beta_ptr; + + if constexpr (IsActHasArgs) { + return { + thread_op_args, + args.thread.activation, + args.ptr_C, + args.dC, + args.ptr_D, + args.dD, + args.thread.bias_ptr, + args.thread.dBias + }; + } + else { + return { + thread_op_args, + args.ptr_C, + args.dC, + args.ptr_D, + args.dD, + args.thread.bias_ptr, + args.thread.dBias + }; + } + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + template + static bool + can_implement( + [[maybe_unused]] ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + return true; + } + + CUTLASS_HOST_DEVICE + Epilogue(Params const& params_) + : params(params_), epilogue_op(params_.thread) { } + + CUTLASS_DEVICE + bool + is_source_needed() { + return epilogue_op.is_source_needed(); + } + + template< + class ProblemShapeMNKL, + class BlockShapeMNK, + class BlockCoordMNKL, + class FrgEngine, class 
FrgLayout, + class TiledMma, + class ResidueMNK + > + CUTLASS_DEVICE void + operator()( + ProblemShapeMNKL problem_shape_mnkl, + BlockShapeMNK blk_shape_MNK, + BlockCoordMNKL blk_coord_mnkl, + cute::Tensor const& accumulators, // (MMA,MMA_M,MMA_N) + TiledMma tiled_mma, + ResidueMNK residue_mnk, + int thread_idx, + char* smem_buf) { + using namespace cute; + using X = Underscore; + + static_assert(cute::rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(is_static::value, "ThreadBlock tile shape must be static"); + static_assert(cute::rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3"); + static_assert(cute::rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 3"); + + // synchronizing function for smem reads/writes +#if CUDA_BARRIER_ENABLED + auto synchronize = [] () { cutlass::arch::NamedBarrier::sync(typename TiledCopyS2R::TiledNumThr{}, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; +#else + auto synchronize = [] () { __syncthreads(); }; +#endif + + // Separate out problem shape for convenience + auto M = get<0>(problem_shape_mnkl); + auto N = get<1>(problem_shape_mnkl); + auto L = get<3>(problem_shape_mnkl); + + // Represent the full output tensor + Tensor mC_mnl = make_tensor(make_gmem_ptr(params.ptr_C), make_shape(M,N,L), params.dC); // (m,n,l) + Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D), make_shape(M,N,L), params.dD); // (m,n,l) + Tensor mBias_mnl = make_tensor(make_gmem_ptr(params.ptr_Bias), make_shape(M,N,L), params.dBias); // (m,n,l) + + Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + Tensor gBias_mnl = local_tile(mBias_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + + // Slice to get the tile this CTA is responsible for + auto [m_coord, n_coord, k_coord, l_coord] = 
blk_coord_mnkl; + Tensor gC = gC_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + Tensor gD = gD_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + Tensor gBias = gBias_mnl(_,_,m_coord,n_coord,l_coord); // (BLK_M,BLK_N) + + // Construct a tensor in SMEM that we can partition for rearranging data + SharedStorage& storage = *reinterpret_cast(smem_buf); + Tensor sAcc = make_tensor(make_smem_ptr(storage.smem_epilogue.data()), SmemLayout{}); // (SMEM_M,SMEM_N) + + // Partition sAcc to match the accumulator partitioning + auto tiled_r2s = make_tiled_copy_C(CopyAtomR2S{}, tiled_mma); + auto thread_r2s = tiled_r2s.get_thread_slice(thread_idx); + Tensor tRS_rAcc = thread_r2s.retile_S(accumulators); // ((Atom,AtomNum), MMA_M, MMA_N) + Tensor tRS_sAcc = thread_r2s.partition_D(sAcc); // ((Atom,AtomNum),PIPE_M,PIPE_N) + + // Tile gD and gC by the shape of SmemLayout first + auto tile = make_shape(size<0>(sAcc), size<1>(sAcc)); + Tensor gCt = flat_divide(gC, tile); // (SMEM_M,SMEM_N,TILE_M,TILE_N) + Tensor gDt = flat_divide(gD, tile); // (SMEM_M,SMEM_N,TILE_M,TILE_N) + Tensor gBiast = flat_divide(gBias, tile); // (SMEM_M,SMEM_N,TILE_M,TILE_N) + + // Partition sAcc, gC, and gD for the output + auto tiled_s2r = TiledCopyS2R{}; + auto thread_s2r = tiled_s2r.get_thread_slice(thread_idx); + Tensor tSR_sAcc = thread_s2r.partition_S(sAcc); // ((Atom,AtomNum),ATOM_M,ATOM_N) + Tensor tSR_gC = thread_s2r.partition_D(gCt); // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N) + Tensor tSR_gD = thread_s2r.partition_D(gDt); // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N) + Tensor tSR_gBias = thread_s2r.partition_D(gBiast); // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N) + + // Allocate intermediate registers on the dst tensors + Tensor tSR_rAcc = make_tensor(take<0,3>(shape(tSR_gC))); // ((Atom,AtomNum),ATOM_M,ATOM_N) + Tensor tSR_rC = make_tensor(shape(tSR_rAcc)); // ((Atom,AtomNum),ATOM_M,ATOM_N) + Tensor tSR_rD = make_tensor(shape(tSR_rAcc)); // ((Atom,AtomNum),ATOM_M,ATOM_N) + Tensor 
tSR_rBias = make_tensor_like(tSR_gBias); // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N) + + // Repeat the D-partitioning for coordinates and predication + Tensor cD = make_identity_tensor(make_shape(size<0>(gD),size<1>(gD))); // (BLK_M,BLK_N) -> (blk_m,blk_n) + Tensor cDt = flat_divide(cD, tile); // (SMEM_M,SMEM_N,TILE_M,TILE_N) + Tensor tSR_cD = thread_s2r.partition_D(cDt); // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N) + + CUTE_STATIC_ASSERT(size<1>(tRS_rAcc) % size<3>(tSR_gC) == 0); // TILE_M divides MMA_M + CUTE_STATIC_ASSERT(size<2>(tRS_rAcc) % size<4>(tSR_gC) == 0); // TILE_N divides MMA_N + +#if 0 + if (thread_idx == 0 && m_coord == 0 && n_coord == 0) { + print("aC : "); print(accumulators.layout()); print("\n"); + print("gC : "); print(gC.layout()); print("\n"); + print("gD : "); print(gD.layout()); print("\n"); + print("gBias : "); print(gBias.layout()); print("\n"); + print("sAcc : "); print(sAcc.layout()); print("\n"); + print("\n"); + print("tRS_sAcc : "); print(tRS_sAcc.layout()); print("\n"); + print("tRS_rAcc : "); print(tRS_rAcc.layout()); print("\n"); + print("\n"); + print("gDt : "); print(gDt.layout()); print("\n"); + print("tSR_sAcc : "); print(tSR_sAcc.layout()); print("\n"); + print("tSR_rAcc : "); print(tSR_rAcc.layout()); print("\n"); + print("\n"); + print("tSR_rC : "); print(tSR_rC.layout()); print("\n"); + print("tSR_rD : "); print(tSR_rD.layout()); print("\n"); + print("tSR_gC : "); print(tSR_gC.layout()); print("\n"); + print("tSR_gD : "); print(tSR_gD.layout()); print("\n"); + print("\n"); + print("gBiast : "); print(gBiast.layout()); print("\n"); + print("tSR_gBias : "); print(tSR_gBias.layout()); print("\n"); + print("tSR_rBias : "); print(tSR_rBias.layout()); print("\n"); + } +#endif + + if constexpr (IsEpilogueBiasSupported) { + if (params.ptr_Bias) { + // Filter so we don't issue redundant copies over stride-0 modes + // (only works if 0-strides are in same location, which is by construction) + Tensor tSR_gBias_flt = 
filter_zeros(tSR_gBias); + Tensor tSR_rBias_flt = filter_zeros(tSR_rBias); + Tensor tSR_cD_flt = filter_zeros(tSR_cD, tSR_gBias.stride()); + Tensor tSR_pD_flt = cute::lazy::transform(tSR_cD_flt, [&](auto const& c){ return elem_less(c, take<0,2>(residue_mnk)); }); + + // Step 0. Copy Bias from GMEM to fragment + copy_if(tSR_pD_flt, tSR_gBias_flt, tSR_rBias_flt); + } + } + + // For each tiling needed for SmemLayout to cover shape(gD) + CUTLASS_PRAGMA_UNROLL + for (int step_m = 0; step_m < size<2>(cDt); ++step_m) { + CUTLASS_PRAGMA_UNROLL + for (int step_n = 0; step_n < size<3>(cDt); ++step_n) { + // Step 1. Copy to SMEM + CUTLASS_PRAGMA_UNROLL + for (int pipe_m = 0; pipe_m < size<1>(tRS_sAcc); ++pipe_m) { + CUTLASS_PRAGMA_UNROLL + for (int pipe_n = 0; pipe_n < size<2>(tRS_sAcc); ++pipe_n) { + int mma_m = step_m * size<1>(tRS_sAcc) + pipe_m; + int mma_n = step_n * size<2>(tRS_sAcc) + pipe_n; + + copy(tiled_r2s, tRS_rAcc(_,mma_m,mma_n), tRS_sAcc(_,pipe_m,pipe_n)); + } + } + + // Step 2. Wait for SMEM writes to complete + synchronize(); + + // Step 3. Copy from SMEM into a fragment + copy(tiled_s2r, tSR_sAcc, tSR_rAcc); + + // Step 4. Wait for SMEM reads to complete + synchronize(); + + Tensor tSR_gDmn = tSR_gD(_,_,_,step_m,step_n); + Tensor tSR_cDmn = tSR_cD(_,_,_,step_m,step_n); + + if constexpr (IsEpilogueBiasSupported) { + Tensor tSR_rBiasmn = tSR_rBias(_,_,_,step_m,step_n); + + if (epilogue_op.is_source_needed()) { + // source is needed + Tensor tSR_gCmn = tSR_gC(_,_,_,step_m,step_n); + + // Step 5. Copy C from GMEM to a fragment + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < size<1>(tSR_gDmn); ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < size<2>(tSR_gDmn); ++n) { + // Predication + if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<0>(tSR_rAcc); ++i) { + tSR_rC(i,m,n) = tSR_gCmn(i,m,n); + } + } + } + } + + // Step 6. 
Elementwise operation with conversion + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tSR_rAcc); ++i) { + if constexpr (IsActHasArgs) { + epilogue_op(tSR_rD(i), tSR_rD(i), tSR_rAcc(i), tSR_rC(i), tSR_rBiasmn(i), params.activation); + } else { + epilogue_op(tSR_rD(i), tSR_rD(i), tSR_rAcc(i), tSR_rC(i), tSR_rBiasmn(i)); + } + } + } + else { + // source is not needed, avoid load and lift compute + + // Step 5. Elementwise operation with conversion + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tSR_rAcc); ++i) { + if constexpr (IsActHasArgs) { + epilogue_op(tSR_rD(i), tSR_rD(i), tSR_rAcc(i), tSR_rBiasmn(i), params.activation); + } else { + epilogue_op(tSR_rD(i), tSR_rD(i), tSR_rAcc(i), tSR_rBiasmn(i)); + } + } + } + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < size<1>(tSR_gDmn); ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < size<2>(tSR_gDmn); ++n) { + // Predication + if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) { + // The Last Step. Copy to GMEM + copy(CopyAtomR2G{}, tSR_rD(_,m,n), tSR_gDmn(_,m,n)); + } + } + } + } else { + if (epilogue_op.is_source_needed()) { + // source is needed + Tensor tSR_gCmn = tSR_gC(_,_,_,step_m,step_n); + + // Step 5. Copy C from GMEM to a fragment + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < size<1>(tSR_gDmn); ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < size<2>(tSR_gDmn); ++n) { + // Predication + if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<0>(tSR_rAcc); ++i) { + tSR_rC(i,m,n) = tSR_gCmn(i,m,n); + } + } + } + } + + // Step 6. Elementwise operation with conversion + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tSR_rAcc); ++i) { + tSR_rD(i) = epilogue_op(tSR_rAcc(i), tSR_rC(i)); + } + } + else { + // source is not needed, avoid load and lift compute + + // Step 5. 
Elementwise operation with conversion + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tSR_rAcc); ++i) { + tSR_rD(i) = epilogue_op(tSR_rAcc(i)); + } + } + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < size<1>(tSR_gDmn); ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < size<2>(tSR_gDmn); ++n) { + // Predication + if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) { + // The Last Step. Copy to GMEM + copy(CopyAtomR2G{}, tSR_rD(_,m,n), tSR_gDmn(_,m,n)); + } + } + } + } + } + } + } + +private: + Params params; + ThreadEpilogueOp epilogue_op; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp new file mode 100644 index 0000000000000000000000000000000000000000..5030efded1e3608d91d0dca87f9f41fff827875f --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp @@ -0,0 +1,412 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing elementwise operations used by epilogues. +*/ + +#pragma once + +#include "cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Ptr Array Epilogue Vectorized +/// Applies an element wise operation to all elements within the fragment +/// and writes it out to destination storage. 
+/// +/// Ways to generalize this: +/// - CTA tile shape +/// - vectorization requirements (GMEM) +/// - vectoriz(able) transform() +/// +template < + class StrideC_, + class StrideD_, + class ThreadEpilogueOp_, + class SmemLayout_, + class CopyAtomR2S_, + class TiledCopyS2R_, + class CopyAtomR2G_, + class EpilogueScheduleType_ +> +class Epilogue< + StrideC_, + StrideD_, + ThreadEpilogueOp_, + SmemLayout_, + CopyAtomR2S_, + TiledCopyS2R_, + CopyAtomR2G_, + EpilogueScheduleType_, + cute::enable_if_t< + cute::is_same_v + > + > { +public: + // + // Type Aliases + // + // derived types of output thread level operator + using ThreadEpilogueOp = ThreadEpilogueOp_; + using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator; + using ElementCompute = typename ThreadEpilogueOp::ElementCompute; + using ElementScalar = ElementCompute; + using ElementOutput = typename ThreadEpilogueOp::ElementOutput; + using ElementC = typename ThreadEpilogueOp::ElementC; + using StrideC = StrideC_; + using InternalStrideC = cute::remove_pointer_t; + using ElementD = typename ThreadEpilogueOp::ElementD; + using StrideD = StrideD_; + using InternalStrideD = cute::remove_pointer_t; + + using SmemLayout = SmemLayout_; + using CopyAtomR2S = CopyAtomR2S_; + using TiledCopyS2R = TiledCopyS2R_; + using CopyAtomR2G = CopyAtomR2G_; + + using GmemTiledCopyC = TiledCopyS2R; + using GmemTiledCopyD = TiledCopyS2R; + + static const int kOutputAlignment = ThreadEpilogueOp::kCount; + + using AlignmentType = typename cute::uint_bit::value * kOutputAlignment>::type; + + static_assert(cute::rank(InternalStrideC{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + static_assert(cute::rank(InternalStrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]"); + + struct SharedStorage + { + cute::array_aligned> smem_epilogue; + }; + + using TensorMapStorage = SharedStorage; + + // Host side epilogue arguments + struct Arguments { + typename ThreadEpilogueOp::Params thread{}; + ElementC const** ptr_C = nullptr; 
+ StrideC dC{}; + ElementD** ptr_D = nullptr; + StrideD dD{}; + }; + + // Device side epilogue params + using Params = Arguments; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments( + ProblemShape const&, + Arguments const& args, + [[maybe_unused]] void* workspace) { + return args; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args, int sm_count) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + template + static bool + can_implement( + [[maybe_unused]] ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + return true; + } + + CUTLASS_HOST_DEVICE + Epilogue(Params const& params_) + : params(params_) { } + + CUTLASS_DEVICE + bool + is_source_needed() { + // For Ptr-Array or Grouped Gemm we cannot determine if source is needed based on first beta. 
+ return true; + } + + template< + class ProblemShapeMNKL, + class BlockShapeMNK, + class BlockCoordMNKL, + class FrgEngine, class FrgLayout, + class TiledMma, + class ResidueMNK + > + CUTLASS_DEVICE void + operator()( + ProblemShapeMNKL problem_shape_mnkl, + BlockShapeMNK blk_shape_MNK, + BlockCoordMNKL blk_coord_mnkl, + cute::Tensor const& accumulators, // (MMA,MMA_M,MMA_N) + TiledMma tiled_mma, + ResidueMNK residue_mnk, + int thread_idx, + char* smem_buf) { + using namespace cute; + using X = Underscore; + + static_assert(cute::rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(is_static::value, "ThreadBlock tile shape must be static"); + static_assert(cute::rank(BlockShapeMNK{}) == 3, "BlockShapeMNK must be rank 3"); + static_assert(cute::rank(BlockCoordMNKL{}) == 4, "BlockCoordMNKL must be rank 3"); + + // synchronizing function for smem reads/writes +#if CUDA_BARRIER_ENABLED + auto synchronize = [] () { cutlass::arch::NamedBarrier::sync(typename TiledCopyS2R::TiledNumThr{}, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; +#else + auto synchronize = [] () { __syncthreads(); }; +#endif + + // Separate out problem shape for convenience + auto M = get<0>(problem_shape_mnkl); + auto N = get<1>(problem_shape_mnkl); + auto L = get<3>(problem_shape_mnkl); + // Batches are managed by using appropriate pointers to C and D matrices + const int32_t mock_L = 1; + const int32_t mock_l_coord = 0; + // Slice to get the tile this CTA is responsible for + auto [m_coord, n_coord, k_coord, l_coord] = blk_coord_mnkl; + + // If scalar alpha/beta are provided, i.e., same alpha/beta applies to all batches/groups. + // If pointers to alpha/beta are provided, i.e., alpha/beta can differ between batches/groups, + // we get the correct alpha/beta values for the current batch/group using group index. 
+ ThreadEpilogueOp epilogue_op = ThreadEpilogueOp(params.thread, l_coord); + + if (epilogue_op.is_source_needed() && params.dC == nullptr) { + // Beta value is non-zero while pointer to C is a nullptr + assert(0); + } + + InternalStrideC stride_c; + InternalStrideD stride_d; + if constexpr (!cute::is_same_v) { + // If grouped gemm + if (epilogue_op.is_source_needed()) { + stride_c = params.dC[l_coord]; + } + stride_d = params.dD[l_coord]; + } + else { + stride_c = params.dC; + stride_d = params.dD; + } + + // Represent the full output tensor + ElementC const* ptr_C_l = nullptr; + if (epilogue_op.is_source_needed()) { + ptr_C_l = params.ptr_C[l_coord]; + } + Tensor mC_mnl = make_tensor(make_gmem_ptr(ptr_C_l), make_shape(M,N,mock_L), stride_c); // (m,n,l) + Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D[l_coord]), make_shape(M,N,mock_L), stride_d); // (m,n,l) + Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{}); // (BLK_M,BLK_N,m,n,l) + + Tensor gC = gC_mnl(_,_,m_coord,n_coord,mock_l_coord); // (BLK_M,BLK_N) + Tensor gD = gD_mnl(_,_,m_coord,n_coord,mock_l_coord); // (BLK_M,BLK_N) + + // Construct a tensor in SMEM that we can partition for rearranging data + SharedStorage& storage = *reinterpret_cast(smem_buf); + Tensor sAcc = make_tensor(make_smem_ptr(storage.smem_epilogue.data()), SmemLayout{}); // (SMEM_M,SMEM_N) + + // Partition sAcc to match the accumulator partitioning + auto tiled_r2s = make_tiled_copy_C(CopyAtomR2S{}, tiled_mma); + auto thread_r2s = tiled_r2s.get_thread_slice(thread_idx); + Tensor tRS_rAcc = thread_r2s.retile_S(accumulators); // ((Atom,AtomNum), MMA_M, MMA_N) + Tensor tRS_sAcc = thread_r2s.partition_D(sAcc); // ((Atom,AtomNum),PIPE_M,PIPE_N) + + // Tile gD and gC by the shape of SmemLayout first + auto tile = make_shape(size<0>(sAcc), size<1>(sAcc)); + Tensor gCt = flat_divide(gC, tile); 
// (SMEM_M,SMEM_N,TILE_M,TILE_N) + Tensor gDt = flat_divide(gD, tile); // (SMEM_M,SMEM_N,TILE_M,TILE_N) + + // Partition sAcc, gC, and gD for the output + auto tiled_s2r = TiledCopyS2R{}; + auto thread_s2r = tiled_s2r.get_thread_slice(thread_idx); + Tensor tSR_sAcc = thread_s2r.partition_S(sAcc); // ((Atom,AtomNum),ATOM_M,ATOM_N) + Tensor tSR_gC = thread_s2r.partition_D(gCt); // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N) + Tensor tSR_gD = thread_s2r.partition_D(gDt); // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N) + + // Allocate intermediate registers on the dst tensors + Tensor tSR_rAcc = make_tensor(take<0,3>(shape(tSR_gC))); // ((Atom,AtomNum),ATOM_M,ATOM_N) + Tensor tSR_rD = make_tensor(shape(tSR_rAcc)); // ((Atom,AtomNum),ATOM_M,ATOM_N) + + // Repeat the D-partitioning for coordinates and predication + Tensor cD = make_identity_tensor(make_shape(size<0>(gD),size<1>(gD))); // (BLK_M,BLK_N) -> (blk_m,blk_n) + Tensor cDt = flat_divide(cD, tile); // (SMEM_M,SMEM_N,TILE_M,TILE_N) + Tensor tSR_cD = thread_s2r.partition_D(cDt); // ((Atom,AtomNum),ATOM_M,ATOM_N,TILE_M,TILE_N) + + CUTE_STATIC_ASSERT(size<1>(tRS_rAcc) % size<3>(tSR_gC) == 0); // TILE_M divides MMA_M + CUTE_STATIC_ASSERT(size<2>(tRS_rAcc) % size<4>(tSR_gC) == 0); // TILE_N divides MMA_N + +#if 0 + if (thread_idx == 0 && m_coord == 0 && n_coord == 0) { + print("aC : "); print(accumulators.layout()); print("\n"); + print("gC : "); print(gC.layout()); print("\n"); + print("gD : "); print(gD.layout()); print("\n"); + print("sAcc : "); print(sAcc.layout()); print("\n"); + print("\n"); + print("tRS_sAcc : "); print(tRS_sAcc.layout()); print("\n"); + print("tRS_rAcc : "); print(tRS_rAcc.layout()); print("\n"); + print("\n"); + print("gDt : "); print(gDt.layout()); print("\n"); + print("tSR_sAcc : "); print(tSR_sAcc.layout()); print("\n"); + print("tSR_rAcc : "); print(tSR_rAcc.layout()); print("\n"); + print("\n"); + print("tSR_rD : "); print(tSR_rD.layout()); print("\n"); + print("tSR_gC : "); 
print(tSR_gC.layout()); print("\n"); + print("tSR_gD : "); print(tSR_gD.layout()); print("\n"); + print("\n"); + } +#endif + + // For each tiling needed for SmemLayout to cover shape(gD) + CUTLASS_PRAGMA_UNROLL + for (int step_m = 0; step_m < size<2>(cDt); ++step_m) { + CUTLASS_PRAGMA_UNROLL + for (int step_n = 0; step_n < size<3>(cDt); ++step_n) { + // Step 1. Copy to SMEM + CUTLASS_PRAGMA_UNROLL + for (int pipe_m = 0; pipe_m < size<1>(tRS_sAcc); ++pipe_m) { + CUTLASS_PRAGMA_UNROLL + for (int pipe_n = 0; pipe_n < size<2>(tRS_sAcc); ++pipe_n) { + int mma_m = step_m * size<1>(tRS_sAcc) + pipe_m; + int mma_n = step_n * size<2>(tRS_sAcc) + pipe_n; + + copy(tiled_r2s, tRS_rAcc(_,mma_m,mma_n), tRS_sAcc(_,pipe_m,pipe_n)); + } + } + + // Step 2. Wait for SMEM writes to complete + synchronize(); + + // Step 3. Copy from SMEM into a fragment + copy(tiled_s2r, tSR_sAcc, tSR_rAcc); + + // Step 4. Wait for SMEM reads to complete + synchronize(); + + Tensor tSR_gDmn = tSR_gD(_,_,_,step_m,step_n); + Tensor tSR_cDmn = tSR_cD(_,_,_,step_m,step_n); + + if (epilogue_op.is_source_needed()) { + // source is needed + Tensor tSR_gCmn = tSR_gC(_,_,_,step_m,step_n); + + Tensor tSR_rCmn = make_tensor(shape(tSR_gCmn)); // ((Atom,AtomNum),ATOM_M,ATOM_N) + + // Step 5. Copy C from GMEM to a fragment + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < size<1>(tSR_gDmn); ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < size<2>(tSR_gDmn); ++n) { + // Predication + if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<0>(tSR_rAcc); ++i) { + tSR_rCmn(i,m,n) = tSR_gCmn(i,m,n); + } + } + } + } + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < size<1>(tSR_gDmn); ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < size<2>(tSR_gDmn); ++n) { + // Predication + if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) { + // Step 6. 
Elementwise operation with conversion + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size<0>(tSR_rAcc); ++i) { + tSR_rD(i,m,n) = epilogue_op(tSR_rAcc(i,m,n), tSR_rCmn(i,m,n)); + } + // Step 7. Copy to GMEM + copy(CopyAtomR2G{}, tSR_rD(_,m,n), tSR_gDmn(_,m,n)); + } + } + } + } + else { + // source is not needed, avoid load and lift compute + + // Step 5. Elementwise operation with conversion + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tSR_rAcc); ++i) { + tSR_rD(i) = epilogue_op(tSR_rAcc(i)); + } + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < size<1>(tSR_gDmn); ++m) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < size<2>(tSR_gDmn); ++n) { + // Predication + if (elem_less(tSR_cDmn(0,m,n), take<0,2>(residue_mnk))) { + // Step 6. Copy to GMEM + copy(CopyAtomR2G{}, tSR_rD(_,m,n), tSR_gDmn(_,m,n)); + } + } + } + } + } + } + } + +private: + Params params; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp new file mode 100644 index 0000000000000000000000000000000000000000..77ef3ed2defbc2f286ac3002185a2864a8b322f8 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp @@ -0,0 +1,1245 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing elementwise operations used by epilogues. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/arch/barrier.h" +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/detail.hpp" +#include "cutlass/epilogue/thread/scale_type.h" +#include "cutlass/epilogue/fusion/callbacks.hpp" +#include "cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp" +#include "cutlass/epilogue/fusion/sm120_callbacks_tma_warpspecialized.hpp" +#include "cutlass/detail/collective.hpp" +#include "cutlass/detail/layout.hpp" +#include "cutlass/trace.h" +#include "cutlass/cuda_host_adapter.hpp" + +#include "cute/tensor.hpp" +#include "cute/atom/copy_traits_sm90_tma.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + int StagesC_, + int StagesD_, + int FragmentSize_, + bool ReuseSmemC_, + bool DelayTmaStore_, + int NumEpilogueWarpGroups_, + class CtaTileMNK_, // (CTA_M,CTA_N,CTA_K) + class EpilogueTile_, // (EPI_TILE_M,EPI_TILE_N) + class ElementC_, + class StrideC_, + class ElementD_, + class StrideD_, + class FusionCallbacks_, + class CopyOpG2S_, + class SmemLayoutAtomC_, + class CopyOpS2R_, + class CopyOpS2G_, + class SmemLayoutAtomD_, + class CopyOpR2S_, + class CopyAtomC_, + class CopyOpR2R_ +> +class CollectiveEpilogue< + Sm90PtrArrayTmaWarpSpecialized, + CtaTileMNK_, + EpilogueTile_, + ElementC_, + StrideC_, + ElementD_, + StrideD_, + FusionCallbacks_, + CopyOpG2S_, + SmemLayoutAtomC_, + CopyOpS2R_, + CopyOpS2G_, + SmemLayoutAtomD_, + CopyOpR2S_, + CopyAtomC_, + CopyOpR2R_ +> { +public: + // + // Type Aliases + // + using DispatchPolicy = Sm90PtrArrayTmaWarpSpecialized; + using CtaTileMNK = CtaTileMNK_; + using EpilogueTile = EpilogueTile_; + using FusionCallbacks = FusionCallbacks_; + using ElementC = ElementC_; + using StrideC = 
StrideC_; + using InternalStrideC = cute::remove_pointer_t; + using ElementD = ElementD_; + using StrideD = StrideD_; + using InternalStrideD = cute::remove_pointer_t; + using CopyOpG2S = CopyOpG2S_; + using SmemLayoutAtomC = SmemLayoutAtomC_; + using CopyOpS2R = CopyOpS2R_; + using CopyOpS2G = CopyOpS2G_; + using SmemLayoutAtomD = SmemLayoutAtomD_; + using CopyOpR2S = CopyOpR2S_; + using CopyAtomC = CopyAtomC_; + using CopyOpR2R = CopyOpR2R_; + + using ThreadEpilogueOp = typename epilogue::fusion::FusionCallbacksTraits::Operation; + using GmemTiledCopyC = CopyOpG2S; + using GmemTiledCopyD = CopyOpS2G; + + static_assert(!is_layout::value && is_tuple::value, "EpilogueTile must be a cute::Tile or cute::Shape"); + static_assert(cute::rank(CtaTileMNK{}) == 3, "CtaTileMNK must be rank-3: [CTA_M, CTA_N, CTA_K]"); + static_assert(cute::rank(EpilogueTile{}) == 2, "EpilogueTile must be rank-2: [EPI_TILE_M, EPI_TILE_N]"); + static_assert(size<0>(CtaTileMNK{}) % size<0>(shape(EpilogueTile{})) == 0, "EPI_TILE_M must divide CTA_M"); + static_assert(size<1>(CtaTileMNK{}) % size<1>(shape(EpilogueTile{})) == 0, "EPI_TILE_N must divide CTA_N"); + static_assert(cute::rank(InternalStrideC{}) == 3, "StrideC must be rank-3: [M, N, L]"); + static_assert(cute::rank(InternalStrideD{}) == 3, "StrideD must be rank-3: [M, N, L]"); + +private: + constexpr static bool is_source_supported = not cute::is_void_v; + constexpr static bool is_destination_supported = not cute::is_void_v; + using NonVoidElementD = cute::conditional_t, ElementD>; + static_assert(not cute::is_void_v, "SmemElementD is void"); + using NonVoidElementC = cute::conditional_t; // prevents void ref breakages + + using SmemElementC = typename cutlass::detail::get_unpacked_element_type::type; + using SmemElementD = typename cutlass::detail::get_unpacked_element_type::type; + + constexpr static int StagesC = StagesC_; + constexpr static int StagesD = StagesD_; + constexpr static bool ReuseSmemC = ReuseSmemC_ and 
is_destination_supported; + constexpr static bool DelayTmaStore = DelayTmaStore_; + + constexpr static bool is_m_major_C = detail::is_m_major(); + constexpr static bool is_m_major_D = detail::is_m_major(); + + constexpr static bool is_im2col_C = cute::is_same_v; + constexpr static bool is_im2col_D = cute::is_same_v; + + // Check if register transformation is needed before copying register to shared memory. + constexpr static bool IsUseR2R = !cute::is_void_v; + + using SmemLayoutC = decltype(tile_to_shape( + SmemLayoutAtomC{}, + make_shape(size<0>(EpilogueTile{}), size<1>(EpilogueTile{}), Int{}), + cute::conditional_t, Step<_1,_2,_3>>{} )); + using SmemLayoutD = decltype(tile_to_shape( + SmemLayoutAtomD{}, + make_shape(size<0>(EpilogueTile{}), size<1>(EpilogueTile{}), Int{}), + cute::conditional_t, Step<_1,_2,_3>>{} )); + + constexpr static bool support_smem_reuse = is_source_supported && is_destination_supported && StagesD <= StagesC + && cosize(take<0,2>(SmemLayoutC{})) == cosize(take<0,2>(SmemLayoutD{})); + static_assert(not (ReuseSmemC && not support_smem_reuse), "Smem reuse requirements not met"); + + constexpr static size_t SmemAlignmentD = cutlass::detail::alignment_for_swizzle(SmemLayoutD{}); + constexpr static size_t SmemAlignmentC = cutlass::detail::alignment_for_swizzle(SmemLayoutC{}); + constexpr static size_t MaxSmemAlignment = cute::max(SmemAlignmentC, SmemAlignmentD); + + using SmemArrayTypeC = cute::ArrayEngine>; + using SmemArrayTypeD = cute::ArrayEngine>; + + using EmptyType = cute::tuple<>; + using SmemCStorage = cute::conditional_t; + using SmemDStorage = cute::conditional_t; + + struct CollectiveStorageWithC { + alignas(SmemAlignmentC) ArrayEngine> smem_C; + alignas(SmemAlignmentD) ArrayEngine> smem_D; + }; + + union CollectiveStorageWithoutC { + cute::array smem_C; + alignas(SmemAlignmentD) ArrayEngine> smem_D; + }; + + union CollectiveStorageReuseC { + alignas(MaxSmemAlignment) ArrayEngine> smem_C; + alignas(MaxSmemAlignment) ArrayEngine> 
smem_D; + }; + +public: + // TMA pipeline for loading C + using LoadPipeline = cutlass::PipelineTransactionAsync; + using LoadPipelineState = cutlass::PipelineState; + constexpr static uint32_t TmaTransactionBytes = + (size(take<0,2>(SmemLayoutC{})) * static_cast(sizeof_bits::value)) / 8; + constexpr static bool RequiresTransactionBytes = true; + + constexpr static int NumEpilogueWarpGroups = NumEpilogueWarpGroups_; + constexpr static uint32_t MinTensorMapWorkspaceAlignment = 64; + + // TMA pipeline for storing D + using StorePipeline = cute::conditional_t, + cutlass::PipelineTmaStore>; + using StorePipelineState = cutlass::PipelineState; + + struct SharedStorage { + struct TensorStorage { + using CollectiveStorage = cute::conditional_t>; + CollectiveStorage collective; + + using FusionStorage = typename FusionCallbacks::SharedStorage; + FusionStorage thread; + } tensors; + + struct TensorMapStorage : cute::aligned_struct<128, _0> { + cute::TmaDescriptor smem_tensormap_C; + cute::array smem_tensormap_D; + } tensormaps; + + using PipelineStorage = typename LoadPipeline::SharedStorage; + PipelineStorage pipeline; + }; + using TensorStorage = typename SharedStorage::TensorStorage; + using TensorMapStorage = typename SharedStorage::TensorMapStorage; + using PipelineStorage = typename SharedStorage::PipelineStorage; + + static constexpr bool IsGroupedGemmKernel = !cute::is_same_v; + + // Host side epilogue arguments + struct Arguments { + typename FusionCallbacks::Arguments thread{}; + ElementC const** ptr_C = nullptr; + StrideC dC; + ElementD ** ptr_D = nullptr; + StrideD dD; + }; + + // Device side epilogue params + struct Params { + using TMA_C = decltype(make_tma_copy( + CopyOpG2S{}, + make_tensor(make_gmem_ptr(static_cast(nullptr)), + repeat_like(InternalStrideC{}, int32_t(0)), InternalStrideC{}), + take<0,2>(SmemLayoutC{}), + EpilogueTile{}, + _1{})); + + using TMA_D = decltype(make_tma_copy( + CopyOpS2G{}, + make_tensor(make_gmem_ptr(static_cast(nullptr)), + 
repeat_like(InternalStrideD{}, int32_t(0)), InternalStrideD{}), + take<0,2>(SmemLayoutD{}), + EpilogueTile{}, + _1{})); + + typename FusionCallbacks::Params thread{}; + TMA_C tma_load_c; + TMA_D tma_store_d; + cute::TmaDescriptor* tensormaps; + ElementC const** ptr_C; + StrideC dC; + ElementD** ptr_D; + StrideD dD; + uint32_t tma_transaction_bytes = TmaTransactionBytes; + }; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments( + ProblemShape const& problem_shape, + Arguments const& args, + [[maybe_unused]] void* workspace) { + // These tensor shapes (only applicable for grouped gemm) and pointers are only used to create tensormap/tma desc. + // These will be replaced with correct values before the initial tma load. + auto init_M = int32_t(size<0>(CtaTileMNK{})); + auto init_N = int32_t(size<1>(CtaTileMNK{})); + auto init_L = 1; + + static_assert(!is_im2col_C and !is_im2col_D, "Im2Col not supported on C or D"); + + InternalStrideC stride_c; + InternalStrideD stride_d; + if constexpr (IsGroupedGemmKernel) { + // Strides for Grouped Gemm will be replaced prior to the first access regardless. + stride_c = InternalStrideC{}; + stride_d = InternalStrideD{}; + } + else { + // Tensor shapes for Ptr-Array are initialized correctly only here. + auto problem_shape_MNKL = append<4>(problem_shape.get_host_problem_shape(0), 1); + init_M = get<0>(problem_shape_MNKL); + init_N = get<1>(problem_shape_MNKL); + stride_c = args.dC; + stride_d = args.dD; + } + + uint32_t transaction_bytes = TmaTransactionBytes; + typename Params::TMA_C tma_load_c{}; + if constexpr (is_source_supported) { + // NOTE: Since TMA desc creation with nullptr not possible until 12.6, we use an initial address even when tensor addresses are on device. This address is never used. 
+ ElementC const* ptr_C_first_batch = reinterpret_cast(reinterpret_cast(args.ptr_C) & 0xFFFFFFFFFFFFFFF0); // Address must be 16B-aligned + Tensor tensor_c = make_tensor(ptr_C_first_batch, make_layout(make_shape(init_M,init_N,init_L), append<3>(stride_c, _0{}))); + tma_load_c = make_tma_copy( + CopyOpG2S{}, + tensor_c, + take<0,2>(SmemLayoutC{}), + EpilogueTile{}, + _1{}); + } + + typename Params::TMA_D tma_store_d{}; + if constexpr (is_destination_supported) { + // NOTE: Since TMA desc creation with nullptr not possible until 12.6, we use an initial address even when tensor addresses are on device. This address is never used. + ElementD const* ptr_D_first_batch = reinterpret_cast(reinterpret_cast(args.ptr_D) & 0xFFFFFFFFFFFFFFF0); // Address must be 16B-aligned + Tensor tensor_d = make_tensor(ptr_D_first_batch, make_layout(make_shape(init_M,init_N,init_L), append<3>(stride_d, _0{}))); + tma_store_d = make_tma_copy( + CopyOpS2G{}, + tensor_d, + take<0,2>(SmemLayoutD{}), + EpilogueTile{}, + _1{}); + } + + auto fusion_workspace = static_cast(workspace); + auto fusion_workspace_size = round_nearest(FusionCallbacks::get_workspace_size(problem_shape, args.thread), MinTensorMapWorkspaceAlignment); + auto tma_descriptor_workspace = reinterpret_cast( + static_cast(workspace) + fusion_workspace_size); + + return { + FusionCallbacks::to_underlying_arguments(problem_shape, args.thread, fusion_workspace), + tma_load_c, + tma_store_d, + tma_descriptor_workspace, + args.ptr_C, + args.dC, + args.ptr_D, + args.dD, + transaction_bytes, + }; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args, int sm_count) { + constexpr uint32_t NumInputTensors = NumEpilogueWarpGroups + (cute::is_void_v ? 
0 : 1); + auto descriptors_shape = cute::make_shape(sm_count, Int{}); + constexpr size_t SizeOfCuTensorMap = sizeof(cute::TmaDescriptor); + // Allocate gmem space for input tensormaps per each SM, A tensormap copies followed by B tensormap copies + return (size(descriptors_shape) * SizeOfCuTensorMap) + + (round_nearest(FusionCallbacks::get_workspace_size(problem_shape, args.thread), MinTensorMapWorkspaceAlignment)); + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return FusionCallbacks::initialize_workspace(problem_shape, args.thread, workspace, stream, cuda_adapter); + } + + template + static bool + can_implement( + ProblemShape problem_shape, + [[maybe_unused]] Arguments const& args) { + + bool implementable = true; + bool fusion_implementable = true; + + if (problem_shape.is_host_problem_shape_available()) { + for (int i = 0; i < problem_shape.groups(); ++i) { + auto problem_shape_MNKL = append<4>(problem_shape.get_host_problem_shape(i), 1); + auto [M,N,K,L] = problem_shape_MNKL; + + if constexpr (is_destination_supported) { + constexpr int tma_alignment_bits_D = cutlass::detail::get_output_alignment_bits(); + constexpr int min_tma_aligned_elements_D = tma_alignment_bits_D / cutlass::sizeof_bits::value; + implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(M,N,L), InternalStrideD{}); + } + + if constexpr (is_source_supported) { + constexpr int tma_alignment_bits_C = cutlass::detail::get_input_alignment_bits(); + constexpr int min_tma_aligned_elements_C = tma_alignment_bits_C / cutlass::sizeof_bits::value; + implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(M,N,L), InternalStrideC{}); + } + + fusion_implementable = fusion_implementable && FusionCallbacks::can_implement(problem_shape_MNKL, args.thread); + } + } + else { + CUTLASS_TRACE_HOST(" 
CAN IMPLEMENT: Ignoring check to can implement because host problem shape is not available.\n"); + } + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); + } + + if (!fusion_implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum requirements for FusionCallbacks.\n"); + } + + bool beta_implementable = true; + + if (cute::is_void_v || args.ptr_C == nullptr) { + if constexpr (detail::has_beta::value) { + beta_implementable = args.thread.beta == 0.0; + } + if constexpr (detail::has_beta_ptr::value) { + beta_implementable = beta_implementable && args.thread.beta_ptr == nullptr; + } + if constexpr (detail::has_beta_ptr_array::value) { + beta_implementable = beta_implementable && args.thread.beta_ptr_array == nullptr; + } + } + + if (!beta_implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Beta/beta pointer was set, but epilogue is sourceless (void-C).\n"); + } + + return implementable && fusion_implementable && beta_implementable; + } + + template + CUTLASS_HOST_DEVICE + static constexpr int + get_load_pipe_increment(TileShapeMNK tile_shape_MNK) { + // Compute number of epilogue subtiles + return size<1>(zipped_divide(make_layout(take<0,2>(tile_shape_MNK)), EpilogueTile{})); + } + + template + CUTLASS_HOST_DEVICE + static constexpr int + get_store_pipe_increment(TileShapeMNK tile_shape_MNK) { + return get_load_pipe_increment(tile_shape_MNK); + } + + CUTLASS_HOST_DEVICE + CollectiveEpilogue(Params const& params_, TensorStorage& shared_tensors) + : params(params_), fusion_callbacks(params_.thread, shared_tensors.thread) {} + + CUTLASS_DEVICE + bool + is_producer_load_needed() const { + return fusion_callbacks.is_producer_load_needed(); + } + + CUTLASS_DEVICE auto + load_init( + Params const& params, + TensorMapStorage& shared_tensormaps, + int32_t sm_count, + int32_t sm_idx) { + // Initialize tma for loading + constexpr bool IsLoad = true; + auto 
load_tensormaps = tensormaps_init(params, shared_tensormaps, sm_count, sm_idx, 0); + return load_tensormaps; + } + + template< + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL, + class TiledMma, + class TensorMapC, + __CUTE_REQUIRES(std::is_pointer_v) + > + CUTLASS_DEVICE auto + load( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_producer_state, + ProblemShapeMNKL problem_shape_mnkl, + TileShapeMNK tile_shape_MNK, + TileCoordMNKL tile_coord_mnkl, + TiledMma tiled_mma, + int thread_idx, + TensorStorage& shared_tensors, + TensorMapC const& load_tensormap, + int subtile_idx=-1) { + using namespace cute; + + // Indexing variables + auto [M, N, K, L] = problem_shape_mnkl; + auto [m_coord, n_coord, k_coord, l_coord] = tile_coord_mnkl; + + static_assert(!is_im2col_D, "Do not support im2col"); + + auto coord_shape = append<3>(make_shape(m_coord, n_coord), Int<0>{}); + + // Represent the full source tensor, slice to get the tile this CTA is currently responsible for + Tensor mC_mn = params.tma_load_c.get_tma_tensor(append<3>(make_shape(M,N), Int<1>{})); // (M,N,L) + Tensor mC = coalesce(mC_mn, take<0,2>(CtaTileMNK{})); + Tensor gC = local_tile(mC, take<0,2>(CtaTileMNK{}), coord_shape); // (CTA_M,CTA_N) + + // Apply epilogue subtile, get matching smem tensor + auto ptr_sC = shared_tensors.collective.smem_C.begin(); + Tensor gC_epi = flat_divide(gC, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + Tensor sC_epi = make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{}); // (EPI_TILE_M,EPI_TILE_N,PIPE_C) + + // Prepare the thread(b)lock's (G)mem to (S)mem TMA tiled copy (bGS_) + ThrCopy thrblk_g2s = params.tma_load_c.get_slice(Int<0>{}); + Tensor bGS_gC = thrblk_g2s.partition_S(gC_epi); // (G2S,G2S_M,G2S_N,EPI_M,EPI_N) + Tensor bGS_sC = thrblk_g2s.partition_D(sC_epi); // (G2S,G2S_M,G2S_N,PIPE_C) + + // Get the fusion callbacks for the producer load warp + auto pld_args = cutlass::epilogue::fusion::detail::ProducerLoadArgs{ + 
problem_shape_mnkl, + CtaTileMNK{}, + tile_coord_mnkl, + tiled_mma, + EpilogueTile{}, + thread_idx + }; + auto pld_callbacks = fusion_callbacks.get_producer_load_callbacks(pld_args); + bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed(); + + LoadPipelineState last_load_producer_state = load_pipe_producer_state; + + // Predication for TMA load (one thread issues TMA load) + bool issue_tma_load = cute::elect_one_sync(); + + // Pre-loop fusion callback entry point + pld_callbacks.begin(); + + LoadPipelineState prior_state = load_pipe_producer_state; + + bool did_load = false; + + CUTLASS_PRAGMA_UNROLL + for (int epi_n = 0; epi_n < size<3>(gC_epi); ++epi_n) { + CUTLASS_PRAGMA_UNROLL + for (int epi_m = 0; epi_m < size<2>(gC_epi); ++epi_m) { + if (subtile_idx != -1 && (epi_n * static_cast(size<2>(gC_epi)) + epi_m) != subtile_idx) { + continue; + } + + // Acquire the lock for this stage + constexpr uint16_t mcast_mask = 0; + uint64_t* tma_barrier = load_pipeline.producer_get_barrier(load_pipe_producer_state); + + load_pipeline.producer_acquire(load_pipe_producer_state); + + // Loop fusion callback entry point + pld_callbacks.step(tma_barrier, epi_m, epi_n, load_pipe_producer_state.count(), issue_tma_load); + + // Execute the TMA load for C if needed + if (is_C_load_needed) { + if (issue_tma_load) { + copy(params.tma_load_c.with(load_tensormap, *tma_barrier, mcast_mask), + bGS_gC(_,_,_,epi_m,epi_n), bGS_sC(_,_,_,load_pipe_producer_state.index())); + load_pipeline.producer_expect_transaction(load_pipe_producer_state); + } + last_load_producer_state = load_pipe_producer_state; + did_load = true; + } + + // Commit TMA loads for this stage and release the lock + load_pipeline.producer_commit(load_pipe_producer_state); + ++load_pipe_producer_state; + } + } + + // Post-loop fusion callback entry point + pld_callbacks.end(); + + return load_pipe_producer_state; + } + + CUTLASS_DEVICE auto + load_tail( + LoadPipeline load_pipeline, + 
LoadPipelineState load_pipe_producer_state) { + + if (!fusion_callbacks.is_producer_load_needed()) { + return load_pipe_producer_state; + } + + bool issue_tma_load = cute::elect_one_sync(); + if (issue_tma_load) { + load_pipeline.producer_tail(load_pipe_producer_state); + } + + return load_pipe_producer_state; + } + + template< + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL, + class AccEngine, class AccLayout, + class TiledMma, + class TensorMapD + > + CUTLASS_DEVICE auto + store( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_consumer_state, + StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state, + ProblemShapeMNKL problem_shape_mnkl, + TileShapeMNK tile_shape_MNK, + TileCoordMNKL tile_coord_mnkl, + cute::Tensor accumulators, + TiledMma tiled_mma, + int thread_idx, + TensorStorage& shared_tensors, + TensorMapD const& store_tensormap, + int subtile_idx=-1) { + + using namespace cute; + using ElementAccumulator = typename AccEngine::value_type; + using ElementCompute_ = typename epilogue::fusion::FusionCallbacksTraits::ElementCompute; + using ElementCompute = cute::conditional_t,ElementAccumulator,ElementCompute_>; + + static_assert(is_rmem::value, "Accumulator must be RF resident."); + static_assert(rank(AccLayout{}) == 3, "Accumulator must be MMA-partitioned: (MMA,MMA_M,MMA_N)"); + static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(is_static::value, "TileShapeMNK must be static"); + static_assert(rank(TileShapeMNK{}) == 3, "TileShapeMNK must be rank 3"); + static_assert(rank(TileCoordMNKL{}) == 4, "TileCoordMNKL must be rank 4"); + + // Indexing variables + auto [M, N, K, L] = problem_shape_mnkl; + auto [m_coord, n_coord, k_coord, l_coord] = tile_coord_mnkl; + + + static_assert(!is_im2col_D, "Do not support im2col"); + + auto coord_shape = append<3>(make_shape(m_coord, n_coord), Int<0>{}); + + // Represent the full output tensor, slice to get the tile this CTA 
is responsible for + Tensor mD_mn = params.tma_store_d.get_tma_tensor(append<3>(make_shape(M,N), Int<1>{})); // (M,N,L) + + Tensor mD = coalesce(mD_mn, take<0,2>(CtaTileMNK{})); + Tensor gD = local_tile(mD, take<0,2>(CtaTileMNK{}), coord_shape); // (CTA_M,CTA_N) + + // Apply epilogue subtiling + Tensor gD_epi = flat_divide(gD, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + + // Construct the corresponding pipelined smem tensors + auto ptr_sC = shared_tensors.collective.smem_C.begin(); + auto ptr_sD = shared_tensors.collective.smem_D.begin(); + Tensor sC_epi = cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{})); // (EPI_TILE_M,EPI_TILE_N,PIPE_C) + Tensor sD_epi = cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(ptr_sD), SmemLayoutD{})); // (EPI_TILE_M,EPI_TILE_N,PIPE_D) + + TiledCopy tiled_copy_C_atom = make_tiled_copy_C_atom(CopyAtomC{}, tiled_mma); + + // (t)hread-partition for (r)egister to (r)egister copy (tRR_) + TiledCopy tiled_r2r = [&]() { + if constexpr (IsUseR2R) { + return make_tiled_copy_S(Copy_Atom{}, tiled_copy_C_atom); + } + else { + return make_tiled_copy_S(Copy_Atom, + ElementCompute>{}, tiled_copy_C_atom); + } + }(); + ThrCopy thread_r2r = tiled_r2r.get_slice(thread_idx); + + // (t)hread-partition for (r)egister to (s)mem copy (tRS_) + TiledCopy tiled_r2s = [&]() { + if constexpr (IsUseR2R) { + return make_tiled_copy_D(Copy_Atom{}, tiled_r2r); + } + else { + return make_tiled_copy_S(Copy_Atom{}, tiled_copy_C_atom); + } + }(); + ThrCopy thread_r2s = tiled_r2s.get_slice(thread_idx); + Tensor tRS_rAcc = thread_r2s.retile_S(accumulators); // ((R2S,R2S_V),MMA_M,MMA_N) + Tensor tRS_sD = thread_r2s.partition_D(sD_epi); // (R2S,R2S_M,R2S_N,PIPE_D) + + auto mma_tile_m = size<0>(TileShapeMNK{}) / size<1>(tRS_rAcc); + auto mma_tile_n = size<1>(TileShapeMNK{}) / size<2>(tRS_rAcc); + auto epi_tile_m = size<0>(EpilogueTile{}); + auto epi_tile_n = size<1>(EpilogueTile{}); + + // 
Allocate D registers + Layout tRS_rD_layout = make_layout(take<0,3>(shape(thread_r2s.partition_S(sD_epi)))); + Tensor tRS_rD = make_tensor(tRS_rD_layout); // (R2S,R2S_M,R2S_N) + + // Vectorized fragment view + constexpr int FragmentSize = DispatchPolicy::FragmentSize; + Tensor tRS_rAcc_frg = recast>(tRS_rAcc); + Tensor tRS_rD_frg = recast>(tRS_rD); + CUTE_STATIC_ASSERT(size<0>(tRS_rAcc) % FragmentSize == 0, "Fragment size does not vectorize properly"); + + // (t)hread-partition for (s)mem to (r)egister copy (tSR_) + TiledCopy tiled_s2r = make_tiled_copy_S(Copy_Atom{}, tiled_copy_C_atom); + ThrCopy thread_s2r = tiled_s2r.get_slice(thread_idx); + Tensor tSR_sC = thread_s2r.partition_S(sC_epi); // (S2R,S2R_M,S2R_N,PIPE_C) + Layout tSR_rC_layout = thread_s2r.retile_D(tRS_rD).layout(); // (S2R,S2R_M,S2R_N) + + // Allocate C registers + // If C smem load is a non-vectorized dst(i) = src(i) then we can allocate C registers directly in the compute type + // to eliminate some redundant pack+unpack instruction sequences for sub-word types + constexpr bool IsDirectS2R = cute::is_same_v> + && decltype(max_common_vector(tSR_rC_layout, tSR_sC.layout()))::value <= 1; + using RegisterElementC = cute::conditional_t; + Tensor tRS_rC = make_tensor(tRS_rD_layout); // (R2S,R2S_M,R2S_N) + Tensor tSR_rC = thread_s2r.retile_D(tRS_rC); // (S2R,S2R_M,S2R_N) + + // thread(b)lock-partition for (s)mem to (g)mem copy (bSG_) + ThrCopy thrblk_s2g = params.tma_store_d.get_slice(Int<0>{}); + Tensor bSG_sD = thrblk_s2g.partition_S(sD_epi); // (S2G,S2G_M,S2G_N,PIPE_D) + Tensor bSG_gD = thrblk_s2g.partition_D(gD_epi); // (S2G,S2G_M,S2G_N,EPI_M,EPI_N) + + // OOB predication for tile quantization "residue" + // Absolute coordinate tensors (dynamic) + Tensor mD_crd = make_identity_tensor(make_shape(M,N)); // (M,N) + Tensor cD_mn = local_tile(mD_crd, take<0,2>(CtaTileMNK{}), make_coord(m_coord, n_coord)); // (CTA_M,CTA_N) + Tensor tRS_cD_mn = thread_r2s.partition_S(flat_divide(cD_mn, EpilogueTile{})); // 
(R2S,R2S_M,R2S_N,EPI_M,EPI_N) + // Relative coordinate tensors (static) + Tensor cD = make_coord_tensor(cD_mn.layout()); // (CTA_M,CTA_N) + Tensor tRS_cD = make_coord_tensor(tRS_cD_mn.layout()); // (R2S,R2S_M,R2S_N,EPI_M,EPI_N) + // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate + auto residue_cD = make_coord(M,N) - cD_mn(_0{}); // (m,n) + auto residue_tRS_cD = make_coord(M,N) - tRS_cD_mn(_0{}); // (m,n) + + CUTE_STATIC_ASSERT(epi_tile_m % mma_tile_m == 0, "MMA_TILE_M must divide EPI_TILE_M"); + + if constexpr (epi_tile_m * epi_tile_n > mma_tile_m * mma_tile_n) { + // When the epilogue subtile is larger than the MMA tiles, loop over multiple MMA tiles + CUTE_STATIC_ASSERT(epi_tile_n % mma_tile_n == 0, "MMA_TILE_N must divide EPI_TILE_N"); + } + else { + CUTE_STATIC_ASSERT(mma_tile_n % epi_tile_n == 0, "EPI_TILE_N must divide MMA_TILE_N"); + } + + // Get TiledCopy for partition reference when consumer store. + TiledCopy tiled_copy_partition_ref = make_tiled_copy_S(Copy_Atom{}, tiled_copy_C_atom); + // Get the fusion callbacks for the consumer store warps + constexpr bool RefSrc = true; // Register tensors reference R2S copy src layout + auto cst_args = cutlass::epilogue::fusion::detail::ConsumerStoreArgs{ + problem_shape_mnkl, + CtaTileMNK{}, + tile_coord_mnkl, + tiled_mma, + EpilogueTile{}, + tiled_copy_partition_ref, + cD, + residue_cD, + tRS_cD, + residue_tRS_cD, + tRS_rC, + thread_idx + }; + auto cst_callbacks = fusion_callbacks.template get_consumer_store_callbacks(cst_args); + bool is_producer_load_needed = fusion_callbacks.is_producer_load_needed(); + bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed(); + + using FragmentVisit = decltype(cst_callbacks.visit(tRS_rAcc_frg(0), 0, 0, 0)); + constexpr bool IsDirectR2S = cute::is_same_v>; + using RegisterElementD = cute::conditional_t; + Tensor tRS_rCompute = make_tensor(tRS_rD_layout); // (R2S,R2S_M,R2S_N) + Tensor 
tRS_rCompute_frg = recast>(tRS_rCompute); + + // Thread synchronizer for previously issued waits or fences + // to ensure visibility of smem reads/writes to threads or TMA unit + auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; + + // Predication for TMA store (a single thread from one warp issues TMA store) + bool issue_tma_store = ((thread_idx / NumThreadsPerWarp) == 0) && cute::elect_one_sync(); + + // In the reuse smem configuration we have StagesC smem buffers and at most StagesD committed TMA stores in flight. + // The TMA store pipeline producer acquire returns when at most StagesD-1 committed stores are in-flight, so we can + // only guarantee store completion after StagesD iterations, then we can begin issuing releases on the smem buffer locks. + // store_pipe_producer_state tracks the acquire and load_pipe_consumer_state tracks the release, in circular buffer fashion. + LoadPipelineState load_wait_state = load_pipe_consumer_state; + if constexpr (ReuseSmemC) { + load_wait_state = store_pipe_producer_state; + load_wait_state.phase_ ^= 1; + } + + // We can delay issue of TMA store by one iteration to achieve better interleaving of non-TMA instructions + // Sync requirements of smem reuse may preclude this optimization + // Delayed stores cause delayed stage releases which causes deadlock when StagesC == StagesD + int epi_m_prev = 0, epi_n_prev = 0; + static_assert(not (DelayTmaStore and ReuseSmemC and StagesC <= StagesD), "This TMA epilogue configuration will deadlock"); + + // The TMA store sequence for one subtile iteration + auto tma_store_fn = [&] (int epi_m, int epi_n) { + // Write the tile from smem to gmem with TMA + cutlass::arch::fence_view_async_shared(); // ensure smem writes are visible to TMA + synchronize(); // ensure all threads have issued their async fence + if constexpr (is_destination_supported) { + if (issue_tma_store) { + 
copy(params.tma_store_d.with(store_tensormap), bSG_sD(_,_,_,store_pipe_producer_state.index()), bSG_gD(_,_,_,epi_m,epi_n)); + } + } + + // Post async fence, pre TMA commit callback entry point + cst_callbacks.tma_store(epi_m, epi_n, store_pipe_producer_state.count(), issue_tma_store); + + // Commit the TMA stores for this stage + if (issue_tma_store) { + store_pipeline.producer_commit(store_pipe_producer_state); + } + ++store_pipe_producer_state; + ++issued_stores; + + // Wait for the next smem buffer to be available + if (issue_tma_store) { + store_pipeline.producer_acquire(store_pipe_producer_state); + } + synchronize(); + + if constexpr (ReuseSmemC) { + // producer_acquire returns when at most StagesD-1 committed stores are pending + bool store_finished = issued_stores > StorePipeline::UnacquiredStages; + // Let dma warp know earliest smem buffer is consumed and empty after StagesD producer commits + if (store_finished) { + if (is_producer_load_needed) { + load_pipeline.consumer_release(load_pipe_consumer_state); + } + ++load_pipe_consumer_state; + } + } + }; + + // + // BEGIN EPILOGUE + // + + // Pre-loop fusion callback entry point + cst_callbacks.begin(); + if (cst_callbacks.begin_sync_needed()) { + synchronize(); + } + + // For each output tile + CUTLASS_PRAGMA_UNROLL + for (int epi_n = 0; epi_n < size<3>(gD_epi); ++epi_n) { + CUTLASS_PRAGMA_UNROLL + for (int epi_m = 0; epi_m < size<2>(gD_epi); ++epi_m) { + bool is_first_iteration = epi_m == 0 && epi_n == 0; + bool is_last_iteration = epi_m == size<2>(gD_epi)-1 && epi_n == size<3>(gD_epi)-1; + + if (subtile_idx != -1 && (epi_n * static_cast(size<2>(gD_epi)) + epi_m) != subtile_idx) { + continue; + } + + cst_callbacks.begin_loop(epi_m, epi_n); + + if (is_producer_load_needed) { + // Wait for the producer load to fill smem + load_pipeline.consumer_wait(load_wait_state); + + if (is_C_load_needed) { + // Copy source tile from smem to register + copy(tiled_s2r, tSR_sC(_,_,_,load_wait_state.index()), tSR_rC); + } 
+ } + + // First loop fusion callback entry point + cst_callbacks.previsit(epi_m, epi_n, load_wait_state.count(), is_producer_load_needed); + + if (is_producer_load_needed) { + if constexpr (not ReuseSmemC) { + // Let producer load warp know smem buffers are consumed and empty + cutlass::arch::fence_view_async_shared(); + load_pipeline.consumer_release(load_pipe_consumer_state); + ++load_pipe_consumer_state; + } + ++load_wait_state; + } + + if constexpr (epi_tile_m * epi_tile_n > mma_tile_m * mma_tile_n) { + // When the epilogue subtile is larger than the MMA tiles, loop over multiple + // MMA tiles + static constexpr int MmaMPerEpiM = epi_tile_m / mma_tile_m; + static constexpr int MmaNPerEpiN = epi_tile_n / mma_tile_n; + + CUTLASS_PRAGMA_UNROLL + for (int mma_n_in_epi = 0; mma_n_in_epi < MmaNPerEpiN; ++mma_n_in_epi) { + int mma_n = (epi_n * MmaNPerEpiN) + mma_n_in_epi; + + CUTLASS_PRAGMA_UNROLL + for (int mma_m_in_epi = 0; mma_m_in_epi < MmaMPerEpiM; ++mma_m_in_epi) { + int mma_m = (epi_m * MmaMPerEpiM) + mma_m_in_epi; + Tensor tRS_rAcc_frg_mn = tRS_rAcc_frg(_,mma_m,mma_n); + int idx_in_epi_subtile = (mma_n_in_epi * MmaMPerEpiM + mma_m_in_epi); + + tRS_rCompute_frg(idx_in_epi_subtile) = cst_callbacks.visit( + tRS_rAcc_frg_mn(0), idx_in_epi_subtile, epi_m, epi_n); + } + } + } + else { + int mma_m = epi_m; + int mma_n = (epi_n * size<1>(EpilogueTile{})) / mma_tile_n; + Tensor tRS_rAcc_frg_mn = tRS_rAcc_frg(_,mma_m,mma_n); + + // Vectorized fragment loop with visitor callback entry point + int epi_n_in_mma = epi_n % (mma_tile_n / epi_tile_n); + int r2s_v = epi_n_in_mma * size(tRS_rCompute_frg); + CUTLASS_PRAGMA_UNROLL + for (int epi_v = 0; epi_v < size(tRS_rCompute_frg); ++epi_v) { + tRS_rCompute_frg(epi_v) = cst_callbacks.visit(tRS_rAcc_frg_mn(r2s_v + epi_v), epi_v, epi_m, epi_n); + } + } + + // The latest we can delay the TMA store is right before the smem store of the next iteration + // since the current TMA store needs to be committed before we can acquire the 
next smem buffer + if constexpr (DelayTmaStore) { + // Issue TMA stores for the previous subtile + if (not is_first_iteration and subtile_idx == -1) { + tma_store_fn(epi_m_prev, epi_n_prev); + } + epi_m_prev = epi_m; + epi_n_prev = epi_n; + } + + // Smem reduction callback entry point using current store buffer for workspace + cst_callbacks.reduce(sD_epi(_,_,store_pipe_producer_state.index()), + synchronize, epi_m, epi_n, is_last_iteration, tRS_rCompute_frg); + + // Copy tile from register to regiser if needed + if constexpr (IsUseR2R) { + // retile source and destination for tiled_r2r + Tensor tRR_rD_src = thread_r2r.retile_S(tRS_rD); // (R2R,R2R_M,R2R_N,EPI_M,EPI_N) + Tensor tRR_rD_dst = thread_r2r.retile_D(tRS_rD); // (R2R,R2R_M,R2R_N,EPI_M,EPI_N) + + // Output needs register shuffling before copying to shared memory. + copy(tiled_r2r, tRR_rD_src, tRR_rD_dst); + } + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tRS_rD_frg); ++i) { + tRS_rD_frg(i) = cutlass::NumericArrayConverter{}(tRS_rCompute_frg(i)); + } + + // Copy tile from register to smem + if constexpr (is_destination_supported) { + copy(tiled_r2s, tRS_rD, tRS_sD(_,_,_,store_pipe_producer_state.index())); + } + + // Post reduction, pre TMA store callback entry point + constexpr bool issue_smem_store = true; // No smem store predication + cst_callbacks.postreduce(epi_m, epi_n, store_pipe_producer_state.count(), issue_smem_store); + + if constexpr (not DelayTmaStore) { + // Issue TMA stores for this subtile + tma_store_fn(epi_m, epi_n); + } + + cst_callbacks.end_loop(epi_m, epi_n); + + } // for epi_m + } // for epi_n + + + if constexpr (DelayTmaStore) { + // Issue TMA stores for the last subtile + tma_store_fn(epi_m_prev, epi_n_prev); + } + + // Post-loop fusion callback entry point + cst_callbacks.end(); + + return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state); + } + + CUTLASS_DEVICE auto + store_tail( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_consumer_state, 
+ StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state) { + // wait for all TMA stores to complete + store_pipeline.producer_tail(store_pipe_producer_state); + // reset store counter + issued_stores = 0; + + if constexpr (ReuseSmemC) { + if (fusion_callbacks.is_producer_load_needed()) { + // Issue releases on up to StagesD-1 previously issued TMA stores + constexpr int release_stages = cute::min(StorePipeline::UnacquiredStages, get_load_pipe_increment(CtaTileMNK{})); + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < release_stages; ++stage) { + load_pipeline.consumer_release(load_pipe_consumer_state); + ++load_pipe_consumer_state; + } + } + } + + return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state); + } + + CUTLASS_DEVICE auto + store_init( + Params const& params, + TensorMapStorage& shared_tensormaps, + int32_t sm_count, + int32_t sm_idx, + int32_t warp_group_idx) { + int warp_idx_in_warp_group = canonical_warp_idx_sync() % NumWarpsPerWarpGroup; + // Since only one warp issues TMA store, we only need that one warp to initialize tensormaps + if (warp_idx_in_warp_group == 0) { + // Initialize tma + constexpr bool IsLoad = false; + auto store_tensormaps = tensormaps_init(params, shared_tensormaps, sm_count, sm_idx, warp_group_idx); + return store_tensormaps; + } + TmaDescriptor* null_tma_desc = nullptr; + return cute::make_tuple(null_tma_desc); + } + + // + // Methods to perform different parts of TMA/Tensormap modifications + // + + template + CUTLASS_DEVICE auto + tensormaps_init( + Params const& params, + TensorMapStorage& shared_tensormaps, + int32_t sm_count, + int32_t sm_idx, + int32_t warp_group_idx) { + + constexpr uint32_t NumInputTensors = NumEpilogueWarpGroups + (cute::is_void_v ? 
0 : 1); + Layout desc_layout = make_layout(make_shape(sm_count, Int{})); + + Tensor gmem_tensormap = make_tensor(params.tensormaps, desc_layout); // (SMs, NumInputTensors) + + if constexpr (IsLoad) { + if (is_source_supported) { + constexpr int C_tensormap_index = NumEpilogueWarpGroups; + Tensor pC_tensormap = make_tensor(params.tma_load_c.get_tma_descriptor(), Int<1>{}, Int<1>{}); + Tensor sC_tensormap = make_tensor(make_smem_ptr(&shared_tensormaps.smem_tensormap_C), Int<1>{}, Int<1>{}); + + if (cute::elect_one_sync()) { + // Bringing tensormaps from params to smem for modification later + copy(recast(pC_tensormap), recast(sC_tensormap)); + } + __syncwarp(); + return cute::make_tuple(&gmem_tensormap(sm_idx, C_tensormap_index)); + + } + TmaDescriptor* null_tma_desc = nullptr; + return cute::make_tuple(null_tma_desc); + } + else { + Tensor pD_tensormap = make_tensor(params.tma_store_d.get_tma_descriptor(), Int<1>{}, Int<1>{}); + Tensor sD_tensormap = make_tensor(make_smem_ptr(&shared_tensormaps.smem_tensormap_D[warp_group_idx]), Int<1>{}, Int<1>{}); + + if (cute::elect_one_sync()) { + // Bringing tensormaps from params to smem for modification later + copy(recast(pD_tensormap), recast(sD_tensormap)); + } + __syncwarp(); + return cute::make_tuple(&gmem_tensormap(sm_idx, warp_group_idx)); + } + } + + // Replace address for the global tensor (to be done by single thread) + template + CUTLASS_DEVICE + void + tensormaps_replace_global_address( + TensorMapStorage& shared_tensormaps, + Params const& params, + int32_t next_batch, + int32_t warp_group_idx) { + // Replacing global_address for the next batch + if constexpr (IsLoad) { + if constexpr (is_source_supported) { + if (params.ptr_C != nullptr) { + cute::tma_descriptor_replace_addr_in_shared_mem(shared_tensormaps.smem_tensormap_C, + params.ptr_C[next_batch]); + } + } + } + else if constexpr (is_destination_supported) { + cute::tma_descriptor_replace_addr_in_shared_mem(shared_tensormaps.smem_tensormap_D[warp_group_idx], 
+ params.ptr_D[next_batch]); + } + } + + // Replace dim and strides for the global tensor - used only for Grouped GEMM (to be done by single thread) + template + CUTLASS_DEVICE + void + tensormaps_replace_global_tensor_properties( + TensorMapStorage& shared_tensormaps, + Params const& params, + int32_t next_group, + ProblemShape_MNKL problem_shape_mnkl, + int32_t warp_group_idx) { + const uint32_t M = get<0>(problem_shape_mnkl); + const uint32_t N = get<1>(problem_shape_mnkl); + // Replace all dims for consistency + constexpr int MaxTensorRank = 5; + cute::array prob_shape = {1,1,1,1,1}; + cute::array prob_stride = {0,0,0,0,0}; + + if constexpr (IsLoad) { + if constexpr (is_source_supported) { + if (params.dC != nullptr) { + ElementC const* ptr_C = nullptr; + Tensor tensor_c = make_tensor(ptr_C, make_layout(make_shape(M,N,Int<1>{}), params.dC[next_group])); + + cute::detail::fill_tma_gmem_shape_stride(params.tma_load_c, tensor_c, + prob_shape, prob_stride); + // Convert strides to byte strides + for (uint64_t& stride : prob_stride) { + stride = (stride * sizeof_bits_v) / 8; + } + cute::tma_descriptor_replace_dims_strides_in_shared_mem(shared_tensormaps.smem_tensormap_C, + prob_shape, + prob_stride); + } + } + } + else if constexpr (is_destination_supported) { + ElementD const* ptr_D = nullptr; + Tensor tensor_d = make_tensor(ptr_D, make_layout(make_shape(M,N,Int<1>{}), params.dD[next_group])); + + cute::detail::fill_tma_gmem_shape_stride(params.tma_store_d, tensor_d, + prob_shape, prob_stride); + // Convert strides to byte strides + for (uint64_t& stride : prob_stride) { + stride = (stride * sizeof_bits_v) / 8; + } + + cute::tma_descriptor_replace_dims_strides_in_shared_mem(shared_tensormaps.smem_tensormap_D[warp_group_idx], + prob_shape, + prob_stride); + } + } + + template + CUTLASS_DEVICE + void + tensormaps_perform_update( + TensorMapStorage& shared_tensormaps, + Params const& params, + cute::TmaDescriptor const* tensormap, + ProblemShape_MNKL 
problem_shape_mnkl, + int32_t next_batch, + int32_t warp_group_idx) { + if (cute::elect_one_sync()) { + // Replacing global_address for the next batch + tensormaps_replace_global_address(shared_tensormaps, params, next_batch, warp_group_idx); + + if constexpr (IsGroupedGemmKernel) { + // Replacing global dims and strides for the next batch + tensormaps_replace_global_tensor_properties( + shared_tensormaps, params, next_batch, problem_shape_mnkl, warp_group_idx); + } + + } + } + + template + CUTLASS_DEVICE + void + tensormaps_cp_fence_release( + TensorMapStorage& shared_tensormaps, + cute::TmaDescriptor const* tensormap, + const int32_t warp_group_idx = 0) { + // Commit and wait for all TMA load/store instructions before updating the tensormap in gmem. + // This operation only happens when the group/batch changes between consecutive tiles. + // If there are no uncommitted instructions then tma_desc_commit_group results in an empty bulk async-group. + auto tma_desc_wait_all_fn = [] () CUTLASS_LAMBDA_FUNC_INLINE { + if (cute::elect_one_sync()) { + cute::tma_desc_commit_group(); + cute::tma_desc_wait_group(); + } + }; + // Entire warp must do this (ie its aligned) + if constexpr (IsLoad) { + if constexpr (is_source_supported) { + tma_desc_wait_all_fn(); + tma_descriptor_cp_fence_release(tensormap, shared_tensormaps.smem_tensormap_C); + } + } + else if constexpr (is_destination_supported) { + tma_desc_wait_all_fn(); + tma_descriptor_cp_fence_release(tensormap, shared_tensormaps.smem_tensormap_D[warp_group_idx]); + } + } + + template + CUTLASS_DEVICE + void + tensormaps_fence_acquire(cute::TmaDescriptor const* tensormap) { + if constexpr (IsLoad) { + if constexpr (is_source_supported) { + cute::tma_descriptor_fence_acquire(tensormap); + } + } + else { + cute::tma_descriptor_fence_acquire(tensormap); + } + } + +private: + Params const& params; + FusionCallbacks fusion_callbacks; + int issued_stores = 0; +}; + + 
+///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp new file mode 100644 index 0000000000000000000000000000000000000000..062b9a8b582a1a3c05407f163a0ca4b05646028a --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp @@ -0,0 +1,958 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing elementwise operations used by epilogues. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/arch/barrier.h" +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/detail.hpp" +#include "cutlass/epilogue/thread/scale_type.h" +#include "cutlass/epilogue/fusion/callbacks.hpp" +#include "cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp" +#include "cutlass/epilogue/fusion/sm120_callbacks_tma_warpspecialized.hpp" +#include "cutlass/detail/collective.hpp" +#include "cutlass/detail/layout.hpp" +#include "cutlass/detail/helper_macros.hpp" +#include "cutlass/trace.h" + +#include "cute/tensor.hpp" +#include "cutlass/cuda_host_adapter.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + int StagesC_, + int StagesD_, + int FragmentSize_, + bool 
ReuseSmemC_, + bool DelayTmaStore_, + class CtaTileMNK_, // (CTA_M,CTA_N,CTA_K) + class EpilogueTile_, // (EPI_TILE_M,EPI_TILE_N) + class ElementC_, + class StrideC_, + class ElementD_, + class StrideD_, + class FusionCallbacks_, + class CopyOpG2S_, + class SmemLayoutAtomC_, + class CopyOpS2R_, + class CopyOpS2G_, + class SmemLayoutAtomD_, + class CopyOpR2S_, + class CopyAtomC_, + class CopyOpR2R_ +> +class CollectiveEpilogue< + Sm90TmaWarpSpecialized, + CtaTileMNK_, + EpilogueTile_, + ElementC_, + StrideC_, + ElementD_, + StrideD_, + FusionCallbacks_, + CopyOpG2S_, + SmemLayoutAtomC_, + CopyOpS2R_, + CopyOpS2G_, + SmemLayoutAtomD_, + CopyOpR2S_, + CopyAtomC_, + CopyOpR2R_ +> { +public: + // + // Type Aliases + // + using DispatchPolicy = Sm90TmaWarpSpecialized; + using CtaTileMNK = CtaTileMNK_; + using EpilogueTile = EpilogueTile_; + using FusionCallbacks = FusionCallbacks_; + using ElementC = ElementC_; + using StrideC = StrideC_; + using ElementD = ElementD_; + using StrideD = StrideD_; + using CopyOpG2S = CopyOpG2S_; + using SmemLayoutAtomC = SmemLayoutAtomC_; + using CopyOpS2R = CopyOpS2R_; + using CopyOpS2G = CopyOpS2G_; + using SmemLayoutAtomD = SmemLayoutAtomD_; + using CopyOpR2S = CopyOpR2S_; + using CopyAtomC = CopyAtomC_; + using CopyOpR2R = CopyOpR2R_; + + using ThreadEpilogueOp = typename epilogue::fusion::FusionCallbacksTraits::Operation; + using GmemTiledCopyC = CopyOpG2S; + using GmemTiledCopyD = CopyOpS2G; + + static_assert(!is_layout::value && is_tuple::value, "EpilogueTile must be a cute::Tile or cute::Shape"); + static_assert(cute::rank(CtaTileMNK{}) == 3, "CtaTileMNK must be rank-3: [CTA_M, CTA_N, CTA_K]"); + static_assert(cute::rank(EpilogueTile{}) == 2, "EpilogueTile must be rank-2: [EPI_TILE_M, EPI_TILE_N]"); + static_assert(size<0>(CtaTileMNK{}) % size<0>(shape(EpilogueTile{})) == 0, "EPI_TILE_M must divide CTA_M"); + static_assert(size<1>(CtaTileMNK{}) % size<1>(shape(EpilogueTile{})) == 0, "EPI_TILE_N must divide CTA_N"); + 
static_assert(cute::rank(StrideC{}) == 3, "StrideC must be rank-3: [M, N, L]"); + static_assert(cute::rank(StrideD{}) == 3, "StrideD must be rank-3: [M, N, L]"); + +private: + constexpr static bool is_source_supported = not cute::is_void_v; + constexpr static bool is_destination_supported = not cute::is_void_v; + using NonVoidElementD = cute::conditional_t, ElementD>; + static_assert(not cute::is_void_v, "SmemElementD is void"); + using NonVoidElementC = cute::conditional_t; // prevents void ref breakages + + using TmaElementD = cute::conditional_t>, uint64_t, NonVoidElementD>; + using TmaElementC = cute::conditional_t>, uint64_t, NonVoidElementC>; + + using SmemElementC = typename cutlass::detail::get_unpacked_element_type::type; + using SmemElementD = typename cutlass::detail::get_unpacked_element_type::type; + + constexpr static int StagesC = StagesC_; + constexpr static int StagesD = StagesD_; + constexpr static bool ReuseSmemC = ReuseSmemC_ and is_destination_supported; + constexpr static bool DelayTmaStore = DelayTmaStore_; + + constexpr static bool is_m_major_C = detail::is_m_major(); + constexpr static bool is_m_major_D = detail::is_m_major(); + + constexpr static bool is_im2col_C = cute::is_same_v; + constexpr static bool is_im2col_D = cute::is_same_v; + + // Check if register transformation is needed before copying register to shared memory. 
+ constexpr static bool IsUseR2R = !cute::is_void_v; + + using SmemLayoutC = decltype(tile_to_shape( + SmemLayoutAtomC{}, + make_shape(size<0>(EpilogueTile{}), size<1>(EpilogueTile{}), Int{}), + cute::conditional_t, Step<_1,_2,_3>>{} )); + using SmemLayoutD = decltype(tile_to_shape( + SmemLayoutAtomD{}, + make_shape(size<0>(EpilogueTile{}), size<1>(EpilogueTile{}), Int{}), + cute::conditional_t, Step<_1,_2,_3>>{} )); + + constexpr static bool support_smem_reuse = is_source_supported && is_destination_supported && StagesD <= StagesC + && cosize(take<0,2>(SmemLayoutC{})) == cosize(take<0,2>(SmemLayoutD{})); + static_assert(not (ReuseSmemC && not support_smem_reuse), "Smem reuse requirements not met"); + + constexpr static size_t SmemAlignmentD = cutlass::detail::alignment_for_swizzle(SmemLayoutD{}); + constexpr static size_t SmemAlignmentC = cutlass::detail::alignment_for_swizzle(SmemLayoutC{}); + constexpr static size_t MaxSmemAlignment = cute::max(SmemAlignmentC, SmemAlignmentD); + + using SmemArrayTypeC = cute::ArrayEngine>; + using SmemArrayTypeD = cute::ArrayEngine>; + + using EmptyType = cute::tuple<>; + using SmemCStorage = cute::conditional_t; + using SmemDStorage = cute::conditional_t; + + struct CollectiveStorageWithC { + alignas(SmemAlignmentC) ArrayEngine> smem_C; + alignas(SmemAlignmentD) ArrayEngine> smem_D; + }; + + union CollectiveStorageWithoutC { + cute::array smem_C; + alignas(SmemAlignmentD) ArrayEngine> smem_D; + }; + + union CollectiveStorageReuseC { + alignas(MaxSmemAlignment) ArrayEngine> smem_C; + alignas(MaxSmemAlignment) ArrayEngine> smem_D; + }; + +public: + // TMA pipeline for loading C + using LoadPipeline = cutlass::PipelineTransactionAsync; + using LoadPipelineState = cutlass::PipelineState; + constexpr static uint32_t TmaTransactionBytes = + (size(take<0,2>(SmemLayoutC{})) * static_cast(sizeof_bits::value)) / 8; + constexpr static bool RequiresTransactionBytes = true; + + // TMA pipeline for storing D + using StorePipeline = 
cute::conditional_t, + cutlass::PipelineTmaStore>; + using StorePipelineState = cutlass::PipelineState; + + struct SharedStorage { + struct TensorStorage { + using CollectiveStorage = cute::conditional_t>; + CollectiveStorage collective; + + using FusionStorage = typename FusionCallbacks::SharedStorage; + FusionStorage thread; + } tensors; + + using PipelineStorage = typename LoadPipeline::SharedStorage; + PipelineStorage pipeline; + }; + using TensorStorage = typename SharedStorage::TensorStorage; + using PipelineStorage = typename SharedStorage::PipelineStorage; + + // Host side epilogue arguments + struct Arguments { + typename FusionCallbacks::Arguments thread{}; + ElementC const* ptr_C; + StrideC dC; + ElementD const* ptr_D; + StrideD dD; + }; + + // Device side epilogue params + struct Params { + using TMA_C = decltype(make_tma_copy( + CopyOpG2S{}, + make_tensor(make_gmem_ptr(nullptr), + repeat_like(StrideC{}, int32_t(0)), StrideC{}), + take<0,2>(SmemLayoutC{}), + EpilogueTile{}, + _1{})); + using TMA_D = decltype(make_tma_copy( + CopyOpS2G{}, + make_tensor(make_gmem_ptr(nullptr), + repeat_like(StrideD{}, int32_t(0)), StrideD{}), + take<0,2>(SmemLayoutD{}), + EpilogueTile{}, + _1{})); + + typename FusionCallbacks::Params thread{}; + TMA_C tma_load_c; + TMA_D tma_store_d; + uint32_t tma_transaction_bytes = TmaTransactionBytes; + }; + + // + // Methods + // + + template + static constexpr Params + to_underlying_arguments( + ProblemShape const& problem_shape, + Arguments const& args, + [[maybe_unused]] void* workspace) { + // Optionally append 1s until problem shape is rank-4 in case its is only rank-3 (MNK) + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + uint32_t transaction_bytes = TmaTransactionBytes; + typename Params::TMA_C tma_load_c{}; + if constexpr (is_source_supported) { + Tensor tensor_c = make_tensor(make_gmem_ptr(args.ptr_C), make_layout(make_shape(M,N,L), args.dC)); + tma_load_c = 
make_tma_copy_C_sm90( + CopyOpG2S{}, + tensor_c, + take<0,2>(SmemLayoutC{}), + EpilogueTile{}); + } + + typename Params::TMA_D tma_store_d{}; + if constexpr (is_destination_supported) { + Tensor tensor_d = make_tensor(make_gmem_ptr(args.ptr_D), make_layout(make_shape(M,N,L), args.dD)); + tma_store_d = make_tma_copy_C_sm90( + CopyOpS2G{}, + tensor_d, + take<0,2>(SmemLayoutD{}), + EpilogueTile{}); + } + + return { + FusionCallbacks::to_underlying_arguments(problem_shape, args.thread, workspace), + tma_load_c, + tma_store_d, + transaction_bytes + }; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return FusionCallbacks::get_workspace_size(problem_shape, args.thread); + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return FusionCallbacks::initialize_workspace(problem_shape, args.thread, workspace, stream, cuda_adapter); + } + + template + static bool + can_implement( + ProblemShape const& problem_shape, + [[maybe_unused]] Arguments const& args) { + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M,N,K,L] = problem_shape_MNKL; + auto shape = cute::make_shape(M,N,L); + + bool implementable = true; + if constexpr (is_destination_supported) { + constexpr int tma_alignment_bits_D = cutlass::detail::get_output_alignment_bits(); + constexpr int min_tma_aligned_elements_D = tma_alignment_bits_D / cutlass::sizeof_bits::value; + if constexpr (cute::is_same_v) { // ignore L stride for implicit gemm + implementable = cutlass::detail::check_alignment(take<0,2>(shape), take<0,2>(StrideD{})); + } + else { + implementable = cutlass::detail::check_alignment(shape, StrideD{}); + } + } + + if constexpr (not cute::is_void_v) { + constexpr int tma_alignment_bits_C = cutlass::detail::get_input_alignment_bits(); + constexpr int 
min_tma_aligned_elements_C = tma_alignment_bits_C / cutlass::sizeof_bits::value; + if constexpr (cute::is_same_v) { // ignore L stride for implicit gemm + implementable = implementable && cutlass::detail::check_alignment(take<0,2>(shape), take<0,2>(StrideC{})); + } + else { + implementable = implementable && cutlass::detail::check_alignment(shape, StrideC{}); + } + } + + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); + } + + bool fusion_implementable = FusionCallbacks::can_implement(problem_shape, args.thread); + + if (!fusion_implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum requirements for FusionCallbacks.\n"); + } + + bool beta_implementable = true; + + if constexpr (cute::is_void_v) { + if constexpr (detail::has_beta::value) { + beta_implementable = args.thread.beta == 0.0; + } + if constexpr (detail::has_beta_ptr::value) { + beta_implementable = beta_implementable && args.thread.beta_ptr == nullptr; + } + } + + if (!beta_implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Beta/beta pointer was set, but epilogue is sourceless (void-C).\n"); + } + + return implementable && fusion_implementable && beta_implementable; + } + + template + CUTLASS_HOST_DEVICE + static constexpr int + get_load_pipe_increment(TileShapeMNK tile_shape_MNK) { + // Compute number of epilogue subtiles + return size<1>(zipped_divide(make_layout(take<0,2>(tile_shape_MNK)), EpilogueTile{})); + } + + template + CUTLASS_HOST_DEVICE + static constexpr int + get_store_pipe_increment(TileShapeMNK tile_shape_MNK) { + return get_load_pipe_increment(tile_shape_MNK); + } + + /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance + CUTLASS_DEVICE + static void + prefetch_tma_descriptors(Params const& epilogue_params) { + if constexpr (is_source_supported) { + cute::prefetch_tma_descriptor(epilogue_params.tma_load_c.get_tma_descriptor()); + 
} + if constexpr (is_destination_supported) { + cute::prefetch_tma_descriptor(epilogue_params.tma_store_d.get_tma_descriptor()); + } + } + + CUTLASS_HOST_DEVICE + CollectiveEpilogue(Params const& params_, TensorStorage& shared_tensors) + : params(params_), fusion_callbacks(params_.thread, shared_tensors.thread) {} + + CUTLASS_DEVICE + bool + is_producer_load_needed() const { + return fusion_callbacks.is_producer_load_needed(); + } + + template< + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL, + class TiledMma + > + CUTLASS_DEVICE auto + load( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_producer_state, + ProblemShapeMNKL problem_shape_mnkl, + TileShapeMNK tile_shape_MNK, + TileCoordMNKL tile_coord_mnkl, + TiledMma tiled_mma, + int thread_idx, + TensorStorage& shared_tensors, + int subtile_idx=-1) { + using namespace cute; + + // Indexing variables + auto [M, N, K, L] = problem_shape_mnkl; + auto [m_coord, n_coord, k_coord, l_coord] = tile_coord_mnkl; + + // The tma tensor C under im2col mode only has two modes (M, N) which + // should be local tiled with only (m_coord, n_coord). 
+ auto coord_shape = conditional_return( + make_coord(m_coord, n_coord), + make_coord(m_coord, n_coord, l_coord)); + + // Represent the full source tensor, slice to get the tile this CTA is currently responsible for + Tensor mC_mn = params.tma_load_c.get_tma_tensor(make_shape(M,N,L)); // (M,N,L) + Tensor mC = coalesce(mC_mn, take<0,2>(CtaTileMNK{})); + Tensor gC = local_tile(mC, take<0,2>(CtaTileMNK{}), coord_shape); // (CTA_M,CTA_N) + + // Apply epilogue subtile, get matching smem tensor + auto ptr_sC = shared_tensors.collective.smem_C.begin(); + Tensor gC_epi = flat_divide(gC, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + Tensor sC_epi = make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{}); // (EPI_TILE_M,EPI_TILE_N,PIPE_C) + + // Prepare the thread(b)lock's (G)mem to (S)mem TMA tiled copy (bGS_) + ThrCopy thrblk_g2s = params.tma_load_c.get_slice(Int<0>{}); + Tensor bGS_gC = thrblk_g2s.partition_S(gC_epi); // (G2S,G2S_M,G2S_N,EPI_M,EPI_N) + Tensor bGS_sC = thrblk_g2s.partition_D(sC_epi); // (G2S,G2S_M,G2S_N,PIPE_C) + + // Get the fusion callbacks for the producer load warp + auto pld_args = cutlass::epilogue::fusion::detail::ProducerLoadArgs( + problem_shape_mnkl, + CtaTileMNK{}, + tile_coord_mnkl, + tiled_mma, + EpilogueTile{}, + thread_idx + ); + auto pld_callbacks = fusion_callbacks.get_producer_load_callbacks(pld_args); + bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed(); + + // Predication for TMA load (one thread issues TMA load) + bool issue_tma_load = cute::elect_one_sync(); + + // Pre-loop fusion callback entry point + pld_callbacks.begin(); + + CUTLASS_PRAGMA_UNROLL + for (int epi_n = 0; epi_n < size<3>(gC_epi); ++epi_n) { + CUTLASS_PRAGMA_UNROLL + for (int epi_m = 0; epi_m < size<2>(gC_epi); ++epi_m) { + if (subtile_idx != -1 && (epi_n * static_cast(size<2>(gC_epi)) + epi_m) != subtile_idx) { + continue; + } + // Acquire the lock for this stage + constexpr uint16_t mcast_mask = 0; + uint64_t* tma_barrier = 
load_pipeline.producer_get_barrier(load_pipe_producer_state); + load_pipeline.producer_acquire(load_pipe_producer_state); + + // Loop fusion callback entry point + pld_callbacks.step(tma_barrier, epi_m, epi_n, load_pipe_producer_state.count(), issue_tma_load); + + // Execute the TMA load for C if needed + if (issue_tma_load && is_C_load_needed) { + copy(params.tma_load_c.with(*tma_barrier, mcast_mask), + bGS_gC(_,_,_,epi_m,epi_n), bGS_sC(_,_,_,load_pipe_producer_state.index())); + load_pipeline.producer_expect_transaction(load_pipe_producer_state); + } + + // Commit TMA loads for this stage and release the lock + load_pipeline.producer_commit(load_pipe_producer_state); + ++load_pipe_producer_state; + } + } + + // Post-loop fusion callback entry point + pld_callbacks.end(); + + return load_pipe_producer_state; + } + + CUTLASS_DEVICE auto + load_tail( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_producer_state) { + bool issue_tma_load = cute::elect_one_sync(); + if (issue_tma_load) { + load_pipeline.producer_tail(load_pipe_producer_state); + } + + return load_pipe_producer_state; + } + + template< + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL, + class AccEngine, class AccLayout, + class TiledMma + > + CUTLASS_DEVICE auto + store( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_consumer_state, + StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state, + ProblemShapeMNKL problem_shape_mnkl, + TileShapeMNK tile_shape_MNK, + TileCoordMNKL tile_coord_mnkl, + cute::Tensor accumulators, + TiledMma tiled_mma, + int thread_idx, + TensorStorage& shared_tensors, + int subtile_idx=-1) { + using namespace cute; + using ElementAccumulator = typename AccEngine::value_type; + using ElementCompute_ = typename epilogue::fusion::FusionCallbacksTraits::ElementCompute; + using ElementCompute = cute::conditional_t,ElementAccumulator,ElementCompute_>; + + static_assert(is_rmem::value, "Accumulator must be RF 
resident."); + static_assert(rank(AccLayout{}) == 3, "Accumulator must be MMA-partitioned: (MMA,MMA_M,MMA_N)"); + static_assert(rank(ProblemShapeMNKL{}) == 4, "ProblemShapeMNKL must be rank 4"); + static_assert(is_static::value, "TileShapeMNK must be static"); + static_assert(rank(TileShapeMNK{}) == 3, "TileShapeMNK must be rank 3"); + static_assert(rank(TileCoordMNKL{}) == 4, "TileCoordMNKL must be rank 4"); + + // Indexing variables + auto [M, N, K, L] = problem_shape_mnkl; + auto [m_coord, n_coord, k_coord, l_coord] = tile_coord_mnkl; + + // The tma tensor D under im2col mode only has two modes (M, N) which + // should be local tiled with only (m_coord, n_coord). + auto coord_shape = conditional_return( + make_coord(m_coord, n_coord), + make_coord(m_coord, n_coord, l_coord)); + + // Represent the full output tensor, slice to get the tile this CTA is responsible for + Tensor mD_mn = params.tma_store_d.get_tma_tensor(make_shape(M,N,L)); // (M,N,L) + Tensor mD = coalesce(mD_mn, take<0,2>(CtaTileMNK{})); + Tensor gD = local_tile(mD, take<0,2>(CtaTileMNK{}), coord_shape); // (CTA_M,CTA_N) + + // Apply epilogue subtiling + Tensor gD_epi = flat_divide(gD, EpilogueTile{}); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + + // Construct the corresponding pipelined smem tensors + auto ptr_sC = shared_tensors.collective.smem_C.begin(); + auto ptr_sD = shared_tensors.collective.smem_D.begin(); + Tensor sC_epi = cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(ptr_sC), SmemLayoutC{})); // (EPI_TILE_M,EPI_TILE_N,PIPE_C) + Tensor sD_epi = cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(ptr_sD), SmemLayoutD{})); // (EPI_TILE_M,EPI_TILE_N,PIPE_D) + + TiledCopy tiled_copy_C_atom = make_tiled_copy_C_atom(CopyAtomC{}, tiled_mma); + + // (t)hread-partition for (r)egister to (r)egister copy (tRR_) + TiledCopy tiled_r2r = [&]() CUTLASS_LAMBDA_FUNC_INLINE { + if constexpr (IsUseR2R) { + return make_tiled_copy_S(Copy_Atom{}, tiled_copy_C_atom); 
+ } + else { + return make_tiled_copy_S(Copy_Atom, + ElementCompute>{}, tiled_copy_C_atom); + } + }(); + ThrCopy thread_r2r = tiled_r2r.get_slice(thread_idx); + + // (t)hread-partition for (r)egister to (s)mem copy (tRS_) + TiledCopy tiled_r2s = [&]() CUTLASS_LAMBDA_FUNC_INLINE { + if constexpr (IsUseR2R) { + return make_tiled_copy_D(Copy_Atom{}, tiled_r2r); + } + else { + return make_tiled_copy_S(Copy_Atom{}, tiled_copy_C_atom); + } + }(); + ThrCopy thread_r2s = tiled_r2s.get_slice(thread_idx); + Tensor tRS_rAcc = thread_r2s.retile_S(accumulators); // ((R2S,R2S_V),MMA_M,MMA_N) + Tensor tRS_sD = thread_r2s.partition_D(sD_epi); // (R2S,R2S_M,R2S_N,PIPE_D) + + auto mma_tile_m = size<0>(TileShapeMNK{}) / size<1>(tRS_rAcc); + auto mma_tile_n = size<1>(TileShapeMNK{}) / size<2>(tRS_rAcc); + auto epi_tile_m = size<0>(EpilogueTile{}); + auto epi_tile_n = size<1>(EpilogueTile{}); + + // Allocate D registers + Layout tRS_rD_layout = make_layout(take<0,3>(shape(thread_r2s.partition_S(sD_epi)))); + Tensor tRS_rD = make_tensor(tRS_rD_layout); // (R2S,R2S_M,R2S_N) + + // Vectorized fragment view + constexpr int FragmentSize = DispatchPolicy::FragmentSize; + Tensor tRS_rAcc_frg = recast>(tRS_rAcc); + Tensor tRS_rD_frg = recast>(tRS_rD); + CUTE_STATIC_ASSERT(size<0>(tRS_rAcc) % FragmentSize == 0, "Fragment size does not vectorize properly"); + + // (t)hread-partition for (s)mem to (r)egister copy (tSR_) + TiledCopy tiled_s2r = make_tiled_copy_S(Copy_Atom{}, tiled_copy_C_atom); + ThrCopy thread_s2r = tiled_s2r.get_slice(thread_idx); + Tensor tSR_sC = thread_s2r.partition_S(sC_epi); // (S2R,S2R_M,S2R_N,PIPE_C) + Layout tSR_rC_layout = thread_s2r.retile_D(tRS_rD).layout(); // (S2R,S2R_M,S2R_N) + + // Allocate C registers + // If C smem load is a non-vectorized dst(i) = src(i) then we can allocate C registers directly in the compute type + // to eliminate some redundant pack+unpack instruction sequences for sub-word types + constexpr bool IsDirectS2R = cute::is_same_v> + && 
decltype(max_common_vector(tSR_rC_layout, tSR_sC.layout()))::value <= 1; + using RegisterElementC = cute::conditional_t; + Tensor tRS_rC = make_tensor(tRS_rD_layout); // (R2S,R2S_M,R2S_N) + Tensor tSR_rC = thread_s2r.retile_D(tRS_rC); // (S2R,S2R_M,S2R_N) + + // thread(b)lock-partition for (s)mem to (g)mem copy (bSG_) + ThrCopy thrblk_s2g = params.tma_store_d.get_slice(Int<0>{}); + Tensor bSG_sD = thrblk_s2g.partition_S(sD_epi); // (S2G,S2G_M,S2G_N,PIPE_D) + Tensor bSG_gD = thrblk_s2g.partition_D(gD_epi); // (S2G,S2G_M,S2G_N,EPI_M,EPI_N) + + // OOB predication for tile quantization "residue" + // Absolute coordinate tensors (dynamic) + Tensor mD_crd = make_identity_tensor(make_shape(M,N)); // (M,N) + Tensor cD_mn = local_tile(mD_crd, take<0,2>(CtaTileMNK{}), make_coord(m_coord, n_coord)); // (CTA_M,CTA_N) + Tensor tRS_cD_mn = [&]() CUTLASS_LAMBDA_FUNC_INLINE { + if constexpr (IsUseR2R) { + // (t)hread-partition for ConsumerStoreCallbacks. + TiledCopy tiled_cst = make_tiled_copy_S(Copy_Atom{}, tiled_copy_C_atom); + ThrCopy thread_cst = tiled_cst.get_slice(thread_idx); + + return thread_cst.partition_S(flat_divide(cD_mn, EpilogueTile{})); // (R2S,R2S_M,R2S_N,EPI_M,EPI_N) + } + else { + return thread_r2s.partition_S(flat_divide(cD_mn, EpilogueTile{})); // (R2S,R2S_M,R2S_N,EPI_M,EPI_N) + } + }(); + // Relative coordinate tensors (static) + Tensor cD = make_coord_tensor(cD_mn.layout()); // (CTA_M,CTA_N) + Tensor tRS_cD = make_coord_tensor(tRS_cD_mn.layout()); // (R2S,R2S_M,R2S_N,EPI_M,EPI_N) + // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate + auto residue_cD = make_coord(M,N) - cD_mn(_0{}); // (m,n) + auto residue_tRS_cD = make_coord(M,N) - tRS_cD_mn(_0{}); // (m,n) + + CUTE_STATIC_ASSERT(epi_tile_m % mma_tile_m == 0, "MMA_TILE_M must divide EPI_TILE_M"); + + if constexpr (epi_tile_m * epi_tile_n > mma_tile_m * mma_tile_n) { + // When the epilogue subtile is larger than the MMA tiles, loop over multiple 
MMA tiles + CUTE_STATIC_ASSERT(epi_tile_n % mma_tile_n == 0, "MMA_TILE_N must divide EPI_TILE_N"); + } + else { + CUTE_STATIC_ASSERT(mma_tile_n % epi_tile_n == 0, "EPI_TILE_N must divide MMA_TILE_N"); + } + + // Get TiledCopy for partition reference when consumer store. + TiledCopy tiled_copy_partition_ref = make_tiled_copy_S(Copy_Atom{}, tiled_copy_C_atom); + // Get the fusion callbacks for the consumer store warps + constexpr bool RefSrc = true; // Register tensors reference tiled copy src layout + auto cst_args = cutlass::epilogue::fusion::detail::ConsumerStoreArgs( + problem_shape_mnkl, + CtaTileMNK{}, + tile_coord_mnkl, + tiled_mma, + EpilogueTile{}, + tiled_copy_partition_ref, + cD, + residue_cD, + tRS_cD, + residue_tRS_cD, + tRS_rC, + thread_idx + ); + auto cst_callbacks = fusion_callbacks.template get_consumer_store_callbacks(cst_args); + bool is_producer_load_needed = fusion_callbacks.is_producer_load_needed(); + bool is_C_load_needed = is_source_supported && fusion_callbacks.is_C_load_needed(); + + using FragmentVisit = decltype(cst_callbacks.visit(tRS_rAcc_frg(0), 0, 0, 0)); + constexpr bool IsDirectR2S = cute::is_same_v>; + using RegisterElementD = cute::conditional_t; + Tensor tRS_rCompute = make_tensor(tRS_rD_layout); // (R2S,R2S_M,R2S_N) + Tensor tRS_rCompute_frg = recast>(tRS_rCompute); + + // Thread synchronizer for previously issued waits or fences + // to ensure visibility of smem reads/writes to threads or TMA unit + auto synchronize = [&] () CUTLASS_LAMBDA_FUNC_INLINE { cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; + + // Predication for TMA store (one warp issues TMA store) + bool issue_tma_store = (thread_idx / NumThreadsPerWarp) == 0; + + // In the reuse smem configuration we have StagesC smem buffers and at most StagesD committed TMA stores in flight. 
+ // The TMA store pipeline producer acquire returns when at most StagesD-1 committed stores are in-flight, so we can + // only guarantee store completion after StagesD iterations, then we can begin issuing releases on the smem buffer locks. + // store_pipe_producer_state tracks the acquire and load_pipe_consumer_state tracks the release, in circular buffer fashion. + LoadPipelineState load_wait_state = load_pipe_consumer_state; + if constexpr (ReuseSmemC) { + load_wait_state = store_pipe_producer_state; + load_wait_state.phase_ ^= 1; + } + + // We can delay issue of TMA store by one iteration to achieve better interleaving of non-TMA instructions + // Sync requirements of smem reuse may preclude this optimization + // Delayed stores cause delayed stage releases which causes deadlock when StagesC == StagesD + [[maybe_unused]] int epi_m_prev = 0; + [[maybe_unused]] int epi_n_prev = 0; + static_assert(not (DelayTmaStore and ReuseSmemC and StagesC <= StagesD), "This TMA epilogue configuration will deadlock"); + + // The TMA store sequence for one subtile iteration + auto tma_store_fn = [&] (int epi_m, int epi_n) CUTLASS_LAMBDA_FUNC_INLINE { + // Write the tile from smem to gmem with TMA + cutlass::arch::fence_view_async_shared(); // ensure smem writes are visible to TMA + synchronize(); // ensure all threads have issued their async fence + if constexpr (is_destination_supported) { + if (issue_tma_store) { + copy(params.tma_store_d, bSG_sD(_,_,_,store_pipe_producer_state.index()), bSG_gD(_,_,_,epi_m,epi_n)); + } + } + + // Post async fence, pre TMA commit callback entry point + cst_callbacks.tma_store(epi_m, epi_n, store_pipe_producer_state.count(), issue_tma_store); + + // Commit the TMA stores for this stage + if (issue_tma_store) { + store_pipeline.producer_commit(store_pipe_producer_state); + } + ++store_pipe_producer_state; + ++issued_stores; + + // Wait for the next smem buffer to be available + if (issue_tma_store) { + 
store_pipeline.producer_acquire(store_pipe_producer_state); + } + synchronize(); + + if constexpr (ReuseSmemC) { + // producer_acquire returns when at most StagesD-1 committed stores are pending + bool store_finished = issued_stores > StorePipeline::UnacquiredStages; + // Let dma warp know earliest smem buffer is consumed and empty after StagesD producer commits + if (store_finished) { + if (is_producer_load_needed) { + load_pipeline.consumer_release(load_pipe_consumer_state); + } + ++load_pipe_consumer_state; + } + } + }; + + // + // BEGIN EPILOGUE + // + + // Pre-loop fusion callback entry point + cst_callbacks.begin(); + if (cst_callbacks.begin_sync_needed()) { + synchronize(); + } + + // For each output tile + CUTLASS_PRAGMA_UNROLL + for (int epi_n = 0; epi_n < size<3>(gD_epi); ++epi_n) { + CUTLASS_PRAGMA_UNROLL + for (int epi_m = 0; epi_m < size<2>(gD_epi); ++epi_m) { + [[maybe_unused]] bool is_first_iteration = epi_m == 0 && epi_n == 0; + bool is_last_iteration = epi_m == size<2>(gD_epi)-1 && epi_n == size<3>(gD_epi)-1; + + if (subtile_idx != -1 && (epi_n * static_cast(size<2>(gD_epi)) + epi_m) != subtile_idx) { + continue; + } + + cst_callbacks.begin_loop(epi_m, epi_n); + + if (is_producer_load_needed) { + // Wait for the producer load to fill smem + load_pipeline.consumer_wait(load_wait_state); + + if (is_C_load_needed) { + // Copy source tile from smem to register + copy(tiled_s2r, tSR_sC(_,_,_,load_wait_state.index()), tSR_rC); + // Ensure smem loads are complete before reusing smem for mixed types/layouts + if constexpr (ReuseSmemC && not (SmemLayoutC{} == SmemLayoutD{})) { + synchronize(); + } + } + } + + // First loop fusion callback entry point + cst_callbacks.previsit(epi_m, epi_n, load_wait_state.count(), is_producer_load_needed); + + if (is_producer_load_needed) { + if constexpr (not ReuseSmemC) { + // Let producer load warp know smem buffers are consumed and empty + cutlass::arch::fence_view_async_shared(); + 
load_pipeline.consumer_release(load_pipe_consumer_state); + ++load_pipe_consumer_state; + } + ++load_wait_state; + } + + if constexpr (epi_tile_m * epi_tile_n > mma_tile_m * mma_tile_n) { + // When the epilogue subtile is larger than the MMA tiles, loop over multiple + // MMA tiles + static constexpr int MmaMPerEpiM = epi_tile_m / mma_tile_m; + static constexpr int MmaNPerEpiN = epi_tile_n / mma_tile_n; + + CUTLASS_PRAGMA_UNROLL + for (int mma_n_in_epi = 0; mma_n_in_epi < MmaNPerEpiN; ++mma_n_in_epi) { + int mma_n = (epi_n * MmaNPerEpiN) + mma_n_in_epi; + + CUTLASS_PRAGMA_UNROLL + for (int mma_m_in_epi = 0; mma_m_in_epi < MmaMPerEpiM; ++mma_m_in_epi) { + int mma_m = (epi_m * MmaMPerEpiM) + mma_m_in_epi; + Tensor tRS_rAcc_frg_mn = tRS_rAcc_frg(_,mma_m,mma_n); + int idx_in_epi_subtile = (mma_n_in_epi * MmaMPerEpiM + mma_m_in_epi); + + tRS_rCompute_frg(idx_in_epi_subtile) = cst_callbacks.visit( + tRS_rAcc_frg_mn(0), idx_in_epi_subtile, epi_m, epi_n); + } + } + } + else { + int mma_m = epi_m; + int mma_n = (epi_n * size<1>(EpilogueTile{})) / mma_tile_n; + Tensor tRS_rAcc_frg_mn = tRS_rAcc_frg(_,mma_m,mma_n); + + // Vectorized fragment loop with visitor callback entry point + int epi_n_in_mma = epi_n % (mma_tile_n / epi_tile_n); + int r2s_v = epi_n_in_mma * size(tRS_rCompute_frg); + CUTLASS_PRAGMA_UNROLL + for (int epi_v = 0; epi_v < size(tRS_rCompute_frg); ++epi_v) { + tRS_rCompute_frg(epi_v) = cst_callbacks.visit(tRS_rAcc_frg_mn(r2s_v + epi_v), epi_v, epi_m, epi_n); + } + } + + // The latest we can delay the TMA store is right before the smem store of the next iteration + // since the current TMA store needs to be committed before we can acquire the next smem buffer + if constexpr (DelayTmaStore) { + // Issue TMA stores for the previous subtile + if (not is_first_iteration and subtile_idx == -1) { + tma_store_fn(epi_m_prev, epi_n_prev); + } + epi_m_prev = epi_m; + epi_n_prev = epi_n; + } + + // Smem reduction callback entry point using current store buffer for 
workspace + cst_callbacks.reduce(sD_epi(_,_,store_pipe_producer_state.index()), + synchronize, epi_m, epi_n, is_last_iteration, tRS_rCompute_frg); + + // Copy tile from register to regiser if needed + if constexpr (IsUseR2R) { + // retile source and destination for tiled_r2r + Tensor tRR_rD_src = thread_r2r.retile_S(tRS_rCompute); // (R2R,R2R_M,R2R_N,EPI_M,EPI_N) + Tensor tRR_rD_dst = thread_r2r.retile_D(tRS_rCompute); // (R2R,R2R_M,R2R_N,EPI_M,EPI_N) + + // Output register transformation before copying to shared memory. + copy(tiled_r2r, tRR_rD_src, tRR_rD_dst); + } + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tRS_rD_frg); ++i) { + tRS_rD_frg(i) = cutlass::NumericArrayConverter{}(tRS_rCompute_frg(i)); + } + + // Copy tile from register to smem + if constexpr (is_destination_supported) { + copy(tiled_r2s, tRS_rD, tRS_sD(_,_,_,store_pipe_producer_state.index())); + } + + // Post reduction, pre TMA store callback entry point + constexpr bool issue_smem_store = true; // No smem store predication + cst_callbacks.postreduce(epi_m, epi_n, store_pipe_producer_state.count(), issue_smem_store); + + if constexpr (not DelayTmaStore) { + // Issue TMA stores for this subtile + tma_store_fn(epi_m, epi_n); + } + + cst_callbacks.end_loop(epi_m, epi_n); + + } // for epi_m + } // for epi_n + + if constexpr (DelayTmaStore) { + // Issue TMA stores for the last subtile + tma_store_fn(epi_m_prev, epi_n_prev); + } + + // Post-loop fusion callback entry point + cst_callbacks.end(); + + return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state); + } + + CUTLASS_DEVICE auto + store_tail( + LoadPipeline load_pipeline, + LoadPipelineState load_pipe_consumer_state, + StorePipeline store_pipeline, + StorePipelineState store_pipe_producer_state) { + // wait for all TMA stores to complete + store_pipeline.producer_tail(store_pipe_producer_state); + // reset store counter + issued_stores = 0; + + if constexpr (ReuseSmemC) { + if 
(fusion_callbacks.is_producer_load_needed()) { + // Issue releases on up to StagesD-1 previously issued TMA stores + constexpr int release_stages = cute::min(StorePipeline::UnacquiredStages, get_load_pipe_increment(CtaTileMNK{})); + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < release_stages; ++stage) { + load_pipeline.consumer_release(load_pipe_consumer_state); + ++load_pipe_consumer_state; + } + } + } + + return cute::make_tuple(load_pipe_consumer_state, store_pipe_producer_state); + } + +private: + Params const& params; + FusionCallbacks fusion_callbacks; + int issued_stores = 0; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized_bias_elementwise.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized_bias_elementwise.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2d5fd85827b2751085a78dcb241aa3cf081470d5 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized_bias_elementwise.hpp @@ -0,0 +1,164 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing pipelined epilogues with bias add and elementwise activation functions. + This collective is now DEPRECATED, will be removed in the next release. Use EVT instead. 
+*/ + +#pragma once + +#include "sm90_epilogue_tma_warpspecialized.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace collective { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + int StagesC_, + int StagesD_, + int FragmentSize_, + class BlockTileShape_, // (BLK_M,BLK_N,BLK_K) + class EpilogueTileShape_, // (EPI_TILE_M,EPI_TILE_N) + class ElementC_, + class StrideC_, + class ElementD_, + class StrideD_, + class FusionCallbacks_, + class CopyOpG2S_, + class SmemLayoutAtomC_, + class CopyOpS2R_, + class CopyOpS2G_, + class SmemLayoutAtomD_, + class CopyOpR2S_, + class CopyAtomC_, + class CopyOpR2R_ +> +class Sm90EpilogueTmaWarpSpecializedBiasElementwise + : public CollectiveEpilogue< + Sm90TmaWarpSpecialized, + BlockTileShape_, + EpilogueTileShape_, + ElementC_, + StrideC_, + ElementD_, + StrideD_, + FusionCallbacks_, + CopyOpG2S_, + SmemLayoutAtomC_, + CopyOpS2R_, + CopyOpS2G_, + SmemLayoutAtomD_, + CopyOpR2S_, + CopyAtomC_, + CopyOpR2R_ +> { +private: + using Impl = + CollectiveEpilogue< + Sm90TmaWarpSpecialized, + BlockTileShape_, + EpilogueTileShape_, + ElementC_, + StrideC_, + ElementD_, + StrideD_, + FusionCallbacks_, + CopyOpG2S_, + SmemLayoutAtomC_, + CopyOpS2R_, + CopyOpS2G_, + SmemLayoutAtomD_, + CopyOpR2S_, + CopyAtomC_, + CopyOpR2R_ + >; +public: + using DispatchPolicy = Sm90TmaWarpSpecializedBiasElementwise; + using ElementCompute = typename Impl::ThreadEpilogueOp::ElementCompute; + using ElementBias = typename Impl::ThreadEpilogueOp::ElementBias; + using ElementT = typename Impl::ThreadEpilogueOp::ElementAux; + + // Constructor inheritance + using Impl::Impl; + + // Host side epilogue arguments + struct [[deprecated("use Sm90TmaWarpSpecialized Arguments instead")]] + Arguments { + struct ThreadArgs { + ElementCompute alpha{1}; + ElementCompute beta{0}; + ElementCompute const 
*alpha_ptr{nullptr}; + ElementCompute const *beta_ptr{nullptr}; + } thread; + ElementC_ const* ptr_C{nullptr}; + StrideC_ dC{}; + ElementD_* ptr_D{nullptr}; + StrideD_ dD{}; + ElementBias const* ptr_Bias{nullptr}; + ElementT* ptr_T{nullptr}; + + CUTLASS_HOST_DEVICE + operator typename Impl::Arguments() const { + typename Impl::Arguments arguments; + arguments.thread.alpha = thread.alpha; + arguments.thread.beta = thread.beta; + arguments.thread.alpha_ptr = thread.alpha_ptr; + arguments.thread.beta_ptr = thread.beta_ptr; + if constexpr (not cute::is_void_v) { + arguments.thread.bias_ptr = ptr_Bias; + } + if constexpr (not cute::is_void_v) { + arguments.thread.aux_ptr = ptr_T; + arguments.thread.dAux = dD; + } + arguments.ptr_C = ptr_C; + arguments.dC = dC; + arguments.ptr_D = ptr_D; + arguments.dD = dD; + + return arguments; + } + }; + +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace collective +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/dispatch_policy.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/dispatch_policy.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ca91ac19b0aadfeddcfb030ee16f03905855cd63 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/dispatch_policy.hpp @@ -0,0 +1,302 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include "cutlass/numeric_conversion.h" +#include "cutlass/epilogue/thread/scale_type.h" + +////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue { + +////////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////////// +// +// Builder Epilogue Schedules +// +////////////////////////////////////////////////////////////////////////////// +// Pre-Hopper schedules +struct PtrArrayDefault {}; +struct EpilogueSimtVectorized {}; +struct EpiloguePtrArraySimtVectorized {}; +// Hopper direct store schedules +struct NoSmemWarpSpecialized {}; +struct PtrArrayNoSmemWarpSpecialized {}; +struct PtrArrayNoSmemWarpSpecializedTransposed {}; +// Hopper TMA schedules +struct TmaWarpSpecialized {}; +struct TmaWarpSpecializedCooperative {}; +struct PtrArrayTmaWarpSpecialized { static constexpr int NumEpilogueWarpGroups = 1; }; +struct PtrArrayTmaWarpSpecializedPingpong { static constexpr int NumEpilogueWarpGroups = 2; }; +struct PtrArrayTmaWarpSpecializedCooperative { static constexpr int NumEpilogueWarpGroups = 2; }; +// Blackwell direct store schedules +struct NoSmemWarpSpecialized1Sm {}; +struct NoSmemWarpSpecialized2Sm {}; +struct FastF32NoSmemWarpSpecialized1Sm : NoSmemWarpSpecialized1Sm {}; +struct FastF32NoSmemWarpSpecialized2Sm : NoSmemWarpSpecialized2Sm {}; +struct BlockwiseNoSmemWarpSpecialized1Sm : NoSmemWarpSpecialized1Sm {}; +struct BlockwiseNoSmemWarpSpecialized2Sm : NoSmemWarpSpecialized2Sm {}; +struct PtrArrayNoSmemWarpSpecialized1Sm : NoSmemWarpSpecialized1Sm {}; +struct PtrArrayNoSmemWarpSpecialized2Sm : NoSmemWarpSpecialized2Sm {}; +struct PtrArrayFastF32NoSmemWarpSpecialized1Sm : PtrArrayNoSmemWarpSpecialized1Sm {}; +struct PtrArrayFastF32NoSmemWarpSpecialized2Sm : PtrArrayNoSmemWarpSpecialized2Sm 
{}; +struct PtrArrayBlockwiseNoSmemWarpSpecialized1Sm : PtrArrayNoSmemWarpSpecialized1Sm {}; +struct PtrArrayBlockwiseNoSmemWarpSpecialized2Sm : PtrArrayNoSmemWarpSpecialized2Sm {}; +// Blackwell TMA schedules +struct TmaWarpSpecialized1Sm {}; +struct TmaWarpSpecialized2Sm {}; +struct PtrArrayTmaWarpSpecialized1Sm : TmaWarpSpecialized1Sm {}; +struct PtrArrayTmaWarpSpecialized2Sm : TmaWarpSpecialized2Sm {}; +struct TmaWarpSpecialized1SmNvf4 final : TmaWarpSpecialized1Sm {}; +struct TmaWarpSpecialized2SmNvf4 final : TmaWarpSpecialized2Sm {}; +struct TmaWarpSpecialized1SmMxf4 final : TmaWarpSpecialized1Sm {}; +struct TmaWarpSpecialized2SmMxf4 final : TmaWarpSpecialized2Sm {}; +struct TmaWarpSpecialized1SmMxf8f6f4 final : TmaWarpSpecialized1Sm {}; +struct TmaWarpSpecialized2SmMxf8f6f4 final : TmaWarpSpecialized2Sm {}; +// Cooperative epilogue schedule for sm120 sparse kernels +struct SparseTmaWarpSpecializedCooperativeSm120 : public TmaWarpSpecializedCooperative {}; + +// DEPRECATED schedules, will be removed in next release +struct TmaWarpSpecializedElementwiseBase : public TmaWarpSpecialized {}; +struct TmaWarpSpecializedCooperativeElementwiseBase : public TmaWarpSpecializedCooperative {}; +template < + template class ActivationFunctor_, + thread::ScaleType::Kind Scale_ = thread::ScaleType::Default, + FloatRoundStyle Round_ = FloatRoundStyle::round_to_nearest +> +struct [[deprecated("Use TmaWarpSpecialized with fusion::LinCombEltAct instead")]] +TmaWarpSpecializedElementwise : public TmaWarpSpecializedElementwiseBase { + template + using ActivationFunctor = ActivationFunctor_; + static constexpr thread::ScaleType::Kind Scale = Scale_; + static constexpr FloatRoundStyle Round = Round_; +}; + +template < + template class ActivationFunctor_, + thread::ScaleType::Kind Scale_ = thread::ScaleType::Default, + FloatRoundStyle Round_ = FloatRoundStyle::round_to_nearest +> +struct [[deprecated("Use TmaWarpSpecializedCooperative with fusion::LinCombEltAct instead")]] 
+TmaWarpSpecializedCooperativeElementwise : public TmaWarpSpecializedCooperativeElementwiseBase { + template + using ActivationFunctor = ActivationFunctor_; + static constexpr thread::ScaleType::Kind Scale = Scale_; + static constexpr FloatRoundStyle Round = Round_; +}; + +struct TmaWarpSpecializedBiasElementwiseBase : public TmaWarpSpecialized{}; +struct TmaWarpSpecializedCooperativeBiasElementwiseBase : public TmaWarpSpecializedCooperative {}; + +template < + template class ActivationFunctor_, + class ElementT_, + template class BiasOp_, + bool StoreT_, + class ElementBias_ +> +struct [[deprecated("Use TmaWarpSpecialized with fusion::LinCombPerRowBiasEltActAux instead")]] +TmaWarpSpecializedBiasElementwise : public TmaWarpSpecializedBiasElementwiseBase { + template + using ActivationFunctor = ActivationFunctor_; + using ElementT = ElementT_; + + template + using BiasOp = BiasOp_; + + static constexpr bool StoreT = StoreT_; + using ElementBias = ElementBias_; +}; + +template < + template class ActivationFunctor_, + class ElementT_, + template class BiasOp_, + bool StoreT_, + class ElementBias_ +> +struct [[deprecated("Use TmaWarpSpecializedCooperative with fusion::LinCombPerRowBiasEltActAux instead")]] +TmaWarpSpecializedCooperativeBiasElementwise : public TmaWarpSpecializedCooperativeBiasElementwiseBase { + template + using ActivationFunctor = ActivationFunctor_; + + using ElementT = ElementT_; + + template + using BiasOp = BiasOp_; + + static constexpr bool StoreT = StoreT_; + using ElementBias = ElementBias_; +}; + +////////////////////////////////////////////////////////////////////////////// +// +// Collective Dispatch Policies +// +////////////////////////////////////////////////////////////////////////////// + +template< + int StagesC_, + int StagesD_, + int FragmentSize_, + bool ReuseSmemC_, + bool DelayTmaStore_ +> +struct Sm90TmaWarpSpecialized { + constexpr static int StagesC = StagesC_; + constexpr static int StagesD = StagesD_; + constexpr static int 
FragmentSize = FragmentSize_; + constexpr static bool ReuseSmemC = ReuseSmemC_; + constexpr static bool DelayTmaStore = DelayTmaStore_; +}; + +template< + int StagesC_, + int StagesD_, + int FragmentSize_, + bool ReuseSmemC_, + bool DelayTmaStore_, + int NumEpilogueWarpGroups_ +> +struct Sm90PtrArrayTmaWarpSpecialized { + constexpr static int StagesC = StagesC_; + constexpr static int StagesD = StagesD_; + constexpr static int FragmentSize = FragmentSize_; + constexpr static bool ReuseSmemC = ReuseSmemC_; + constexpr static bool DelayTmaStore = DelayTmaStore_; + constexpr static int NumEpilogueWarpGroups = NumEpilogueWarpGroups_; +}; + +// DEPRECATED policies, will be removed in next release +template< + int StagesC_, + int StagesD_, + int FragmentSize_ = 2 +> +struct Sm90TmaWarpSpecializedBiasElementwise { + constexpr static int StagesC = StagesC_; + constexpr static int StagesD = StagesD_; + constexpr static int FragmentSize = FragmentSize_; +}; + + +template< + int StagesC_, + int StagesD_, + int FragmentSize_, + bool ReuseSmemC_, + bool DelayTmaStore_ +> +struct Sm100TmaWarpSpecialized { + constexpr static int StagesC = StagesC_; + constexpr static int StagesD = StagesD_; + constexpr static int FragmentSize = FragmentSize_; + constexpr static bool ReuseSmemC = ReuseSmemC_; + constexpr static bool DelayTmaStore = DelayTmaStore_; +}; + +template< + int StagesC_, + int StagesD_, + int FragmentSize_, + bool ReuseSmemC_, + bool DelayTmaStore_ +> +struct Sm100PtrArrayTmaWarpSpecialized { + constexpr static int StagesC = StagesC_; + constexpr static int StagesD = StagesD_; + constexpr static int FragmentSize = FragmentSize_; + constexpr static bool ReuseSmemC = ReuseSmemC_; + constexpr static bool DelayTmaStore = DelayTmaStore_; + + static_assert(StagesC >= 1, "StagesC must be >= 1"); + static_assert(StagesD >= 1, "StagesD must be >= 1"); +}; + +struct Sm100NoSmem { + constexpr static int StagesC = 1; + constexpr static int StagesD = 1; + constexpr static int 
FragmentSize = 1; +}; + +struct Sm100NoSmemWarpSpecialized { + constexpr static int StagesC = 1; + constexpr static int StagesD = 1; + constexpr static int FragmentSize = 1; +}; + +struct Sm100PtrArrayNoSmem { + constexpr static int StagesC = 1; + constexpr static int StagesD = 1; + constexpr static int FragmentSize = 1; +}; + +struct Sm100PtrArrayNoSmemWarpSpecialized { + constexpr static int StagesC = 1; + constexpr static int StagesD = 1; + constexpr static int FragmentSize = 1; +}; +template< + int StagesC_, + int StagesD_, + int FragmentSize_, + bool ReuseSmemC_, + bool DelayTmaStore_ +> +struct Sm120TmaWarpSpecialized { + constexpr static int StagesC = StagesC_; + constexpr static int StagesD = StagesD_; + constexpr static int FragmentSize = FragmentSize_; + constexpr static bool ReuseSmemC = ReuseSmemC_; + constexpr static bool DelayTmaStore = DelayTmaStore_; +}; + +template< + int StagesC_, + int StagesD_, + int FragmentSize_, + bool ReuseSmemC_, + bool DelayTmaStore_, + int NumEpilogueWarpGroups_ +> +struct Sm120PtrArrayTmaWarpSpecialized { + constexpr static int StagesC = StagesC_; + constexpr static int StagesD = StagesD_; + constexpr static int FragmentSize = FragmentSize_; + constexpr static bool ReuseSmemC = ReuseSmemC_; + constexpr static bool DelayTmaStore = DelayTmaStore_; + constexpr static int NumEpilogueWarpGroups = NumEpilogueWarpGroups_; +}; + +////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/callbacks.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/callbacks.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f9febeec4d92d54ec02e221d028f7329c2edeea5 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/callbacks.hpp @@ -0,0 
+1,91 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#pragma once + +#include "cutlass/detail/dependent_false.hpp" +#include "cutlass/epilogue/fusion/operations.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::fusion { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Dispatch interface for epilogue fusion callbacks +// For visitor fusions, this is just a convenience wrapper to provide metadata and non-nested args. +// It is also valid to just pass visitor callbacks directly to the collective, e.g. fusion::Sm90LinearCombination, +// provided the collective supports a visitor callbacks interface. This is useful for implementing custom fusions. +template < + class DispatchPolicy, // specialize on collective's dispatch policy since callbacks API will depend on collective's algorithm + class Operation, // the fusion operation being performed, e.g. fusion::LinearCombination + class CtaTile_MNK, // computed tile per CTA + class EpilogueTile_MN, // epilogue subtile size + class... Args // callbacks implementation dependent args (e.g. copy atoms, smem layouts) +> +struct FusionCallbacks { + static_assert(cutlass::detail::dependent_false, "Could not find a callbacks specialization."); +}; + +// Metadata helper to handle custom EVTs or other non-FusionCallbacks types +template +struct FusionCallbacksTraits { + using DispatchPolicy = void; + using Callbacks = T; + using Operation = FusionOperation; + using CtaTile_MNK = void; + using EpilogueTile_MN = void; + using ElementCompute = void; +}; + +template < + class DispatchPolicy_, + class Operation_, + class CtaTile_MNK_, + class EpilogueTile_MN_, + class... 
Args +> +struct FusionCallbacksTraits< + FusionCallbacks +> { + using DispatchPolicy = DispatchPolicy_; + using Callbacks = FusionCallbacks; + using Operation = Operation_; + using CtaTile_MNK = CtaTile_MNK_; + using EpilogueTile_MN = EpilogueTile_MN_; + using ElementCompute = typename Operation::ElementCompute; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::fusion + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/operations.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/operations.hpp new file mode 100644 index 0000000000000000000000000000000000000000..114737a9d910a458f4895212d0904e002a9aeec8 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/operations.hpp @@ -0,0 +1,645 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once + +#include +#include +#include +#include // cute::false_type + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::fusion { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Fusion Operations +// Template args must not be implementation dependent +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +struct FusionOperation { + // metadata types/queries that can be overrided + using ElementOutput = void; + using ElementCompute = void; + FloatRoundStyle RoundStyle = FloatRoundStyle::round_indeterminate; + + using ElementSource = void; + static constexpr bool IsSourceSupported = false; 
+ static constexpr bool IsResidualSupported = false; // Source is added after activation + + using ElementScalar = void; + static constexpr int AlignmentScalar = 0; + static constexpr bool IsScaleFactorSupported = false; + static constexpr bool IsPerRowScaleSupported = false; + static constexpr bool IsPerColScaleSupported = false; + + using ElementBias = void; + static constexpr int AlignmentBias = 0; + static constexpr bool IsPerRowBiasSupported = false; + static constexpr bool IsPerColBiasSupported = false; + static constexpr bool IsDePerRowBiasSupported = false; + + using ActivationFn = void; + static constexpr bool IsEltActSupported = false; + static constexpr bool IsDeEltActSupported = false; + + using ElementAux = void; + using GmemLayoutTagAux = void; + static constexpr int AlignmentAux = 0; + static constexpr bool IsAuxOutSupported = false; + static constexpr bool IsAuxInSupported = false; + + using ElementAmax = void; + static constexpr bool IsAbsMaxSupported = false; + + using ElementBlockScaleFactor = void; + static constexpr int SFVecSize = 0; + static constexpr bool IsBlockScaleSupported = false; // Umbrella variable to check BlockScaling support in the epilogues + using GmemLayoutTagScalefactor = void; +}; + +// D = alpha * acc +template< + class ElementOutput_, + class ElementCompute_, + class ElementScalar_ = ElementCompute_, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct ScaledAcc : FusionOperation { + using ElementOutput = ElementOutput_; + using ElementCompute = ElementCompute_; + using ElementScalar = ElementScalar_; + static constexpr int AlignmentScalar = 1; + static constexpr auto RoundStyle = RoundStyle_; +}; + +// D = alpha * acc + beta * C +template< + class ElementOutput_, + class ElementCompute_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinearCombination + : ScaledAcc { + using ElementSource 
= ElementSource_; + static constexpr bool IsSourceSupported = true; +}; + +// D = activation(alpha * acc + beta * C) +template< + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinCombEltAct + : LinearCombination { + using ActivationFn = ActivationFn_; + static constexpr bool IsEltActSupported = true; +}; + +// D = softmax(top_k(alpha * acc + beta * C)) +template< + int TopK, + class ElementOutput_, + class ElementCompute_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinCombTopKSoftmaxCol + : LinearCombination { +}; + + +// D = alpha * acc + beta * C + per-row bias +template< + class ElementOutput_, + class ElementCompute_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinCombPerRowBias + : LinearCombination { + using ElementBias = ElementBias_; + static constexpr int AlignmentBias = AlignmentBias_; + static constexpr bool IsPerRowBiasSupported = true; +}; + +// D = alpha * acc + beta * C + per-column bias +template< + class ElementOutput_, + class ElementCompute_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinCombPerColBias + : LinearCombination { + using ElementBias = ElementBias_; + static constexpr int AlignmentBias = AlignmentBias_; + static constexpr bool IsPerColBiasSupported = true; +}; + +// D = activation(alpha * acc + beta * C + per-row bias) 
+template< + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinCombPerRowBiasEltAct + : LinCombPerRowBias { + using ActivationFn = ActivationFn_; + static constexpr bool IsEltActSupported = true; +}; + +// Grouped Wgrad's D = alpha * acc + beta * C with special AccFetch. +template< + class GroupsPerTile_, + class ElementOutput_, + class ElementCompute_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinearCombinationGroupedWgrad + : LinearCombination { + using GroupsPerTile = GroupsPerTile_; +}; + +// D = activation(alpha * acc + beta * C + per-column bias) +template< + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinCombPerColBiasEltAct + : LinCombPerColBias { + using ActivationFn = ActivationFn_; + static constexpr bool IsEltActSupported = true; +}; + +// D = activation(alpha * acc + beta * C + per-row bias) +// aux = alpha * acc + beta * C + per-row bias +template< + class GmemLayoutTagAux_, + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementAux_ = ElementOutput_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentAux_ = 128 / cute::sizeof_bits_v, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> 
+struct LinCombPerRowBiasEltActAux + : LinCombPerRowBiasEltAct { + using ElementAux = ElementAux_; + using GmemLayoutTagAux = GmemLayoutTagAux_; + static constexpr int AlignmentAux = AlignmentAux_; + static constexpr bool IsAuxOutSupported = true; +}; + +// D = activation(alpha * acc + beta * C + per-col bias) +// aux = alpha * acc + beta * C + per-col bias +template< + class GmemLayoutTagAux_, + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementAux_ = ElementOutput_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentAux_ = 128 / cute::sizeof_bits_v, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinCombPerColBiasEltActAux + : LinCombPerColBiasEltAct { + using ElementAux = ElementAux_; + using GmemLayoutTagAux = GmemLayoutTagAux_; + static constexpr int AlignmentAux = AlignmentAux_; + static constexpr bool IsAuxOutSupported = true; +}; + +// D = activation(per-row alpha * acc + per-row beta * C + per-row bias) +template< + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, // per-row alpha/beta + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + int AlignmentScalar_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct PerRowLinCombPerRowBiasEltAct + : LinCombPerRowBiasEltAct { + static constexpr int AlignmentScalar = AlignmentScalar_; + static constexpr bool IsPerRowScaleSupported = true; +}; + +// D = activation(per-col alpha * acc + per-col beta * C + per-column bias) +template< + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class 
ElementScalar_ = ElementCompute_, // per-row alpha/beta + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + int AlignmentScalar_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct PerColLinCombPerColBiasEltAct + : LinCombPerColBiasEltAct { + static constexpr int AlignmentScalar = AlignmentScalar_; + static constexpr bool IsPerColScaleSupported = true; +}; + +// D = activation(per-col alpha * acc + per-column bias) + per-col beta * C +template< + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, // per-row alpha/beta + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + int AlignmentScalar_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct PerColResAddPerColBiasEltAct + : PerColLinCombPerColBiasEltAct { + static constexpr bool IsResidualSupported = true; +}; + +// Z = scale_a * scale_b * alpha * acc + beta * scale_c * C + per-row bias +// if D is fp8 +// D = scale_d * activation(Z) +// else +// D = activation(Z) +template< + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct ScaledLinCombPerRowBiasEltAct + : LinCombPerRowBiasEltAct { + static constexpr bool IsScaleFactorSupported = true; +}; + +// Z = scale_a * scale_b * alpha * acc + beta * scale_c * C + per-col bias +// if D is fp8 +// D = scale_d * activation(Z) +// else +// D = activation(Z) +template< + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = 
ElementCompute_, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct ScaledLinCombPerColBiasEltAct + : LinCombPerColBiasEltAct { + static constexpr bool IsScaleFactorSupported = true; +}; + +// Z = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-row bias +// if D is fp8 +// amax_d = max(abs(elements in activation(Z))) +// D = scale_d * activation(Z) +// else +// D = activation(Z) +// if Aux is fp8 +// amax_aux = max(abs(elements in Z)) +// Aux = scale_aux * Z +// else +// Aux = Z +template< + class GmemLayoutTagAux_, + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementAux_ = ElementOutput_, + class ElementAmax_ = ElementCompute_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentAux_ = 128 / cute::sizeof_bits_v, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct ScaledLinCombPerRowBiasEltActAmaxAux + : ScaledLinCombPerRowBiasEltAct { + using ElementAmax = ElementAmax_; + static constexpr bool IsAbsMaxSupported = true; + + using ElementAux = ElementAux_; + using GmemLayoutTagAux = GmemLayoutTagAux_; + static constexpr int AlignmentAux = AlignmentAux_; + static constexpr bool IsAuxOutSupported = true; +}; + +// Z = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-col bias +// if D is fp8 +// amax_d = max(abs(elements in activation(Z))) +// D = scale_d * activation(Z) +// else +// D = activation(Z) +// if Aux is fp8 +// amax_aux = max(abs(elements in Z)) +// Aux = scale_aux * Z +// else +// Aux = Z +template< + class GmemLayoutTagAux_, + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementAux_ = ElementOutput_, + class ElementAmax_ = ElementCompute_, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, 
+ class ElementScalar_ = ElementCompute_, + int AlignmentAux_ = 128 / cute::sizeof_bits_v, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct ScaledLinCombPerColBiasEltActAmaxAux + : ScaledLinCombPerColBiasEltAct { + using ElementAmax = ElementAmax_; + static constexpr bool IsAbsMaxSupported = true; + + using ElementAux = ElementAux_; + using GmemLayoutTagAux = GmemLayoutTagAux_; + static constexpr int AlignmentAux = AlignmentAux_; + static constexpr bool IsAuxOutSupported = true; +}; + +// Z = Aux +// dY = alpha * acc + beta * C +// D = d_activation(dY, Z) +template< + class GmemLayoutTagAux_, + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementAux_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentAux_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinCombDeEltAct + : LinearCombination { + using ActivationFn = ActivationFn_; + static constexpr bool IsDeEltActSupported = true; + + using ElementAux = ElementAux_; + using GmemLayoutTagAux = GmemLayoutTagAux_; + static constexpr int AlignmentAux = AlignmentAux_; + static constexpr bool IsAuxInSupported = true; +}; + +// Z = Aux +// dY = alpha * acc + beta * C +// D = d_activation(dY, Z) +// dBias = sum of columns of D +template< + class GmemLayoutTagAux_, + template class ActivationFn_, + class ElementOutput_, + class ElementCompute_, + class ElementAux_ = ElementOutput_, + class ElementBias_ = ElementCompute_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentAux_ = 128 / cute::sizeof_bits_v, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinCombDeEltActDePerRowBias + : LinCombDeEltAct { + using ElementBias = ElementBias_; + static constexpr int 
AlignmentBias = AlignmentBias_; + static constexpr bool IsDePerRowBiasSupported = true; +}; + +template< + int SFVecSize_, + class ElementOutput_, + class ElementCompute_, + class ElementBlockScaleFactor_, + class GmemLayoutTagScalefactor_ = cutlass::layout::RowMajor, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinCombBlockScaleFactor + : LinearCombination { + using ElementBlockScaleFactor = ElementBlockScaleFactor_; + static constexpr int SFVecSize = SFVecSize_; + static constexpr bool IsBlockScaleSupported = true; + using GmemLayoutTagScalefactor = GmemLayoutTagScalefactor_; +}; + +// D = activation(alpha * acc + beta * C) +// With BlockScaleFactor generation (same recipe as LinCombBlockScaleFactor). +template< + template class ActivationFn_, + int SFVecSize_, + class ElementOutput_, + class ElementCompute_, + class ElementBlockScaleFactor_, + class GmemLayoutTagScalefactor_ = cutlass::layout::RowMajor, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinCombEltActBlockScaleFactor + : LinCombEltAct { + using ElementBlockScaleFactor = ElementBlockScaleFactor_; + static constexpr int SFVecSize = SFVecSize_; + static constexpr bool IsBlockScaleSupported = true; + using GmemLayoutTagScalefactor = GmemLayoutTagScalefactor_; +}; + +// D = alpha * acc + beta * C + per-row bias +// With BlockScaleFactor generation +template< + int SFVecSize_, + class ElementOutput_, + class ElementCompute_, + class ElementBlockScaleFactor_, + class GmemLayoutTagScalefactor_ = cutlass::layout::RowMajor, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct 
LinCombPerRowBiasBlockScaleFactor + : LinCombPerRowBias { + using ElementBlockScaleFactor = ElementBlockScaleFactor_; + static constexpr int SFVecSize = SFVecSize_; + static constexpr bool IsBlockScaleSupported = true; + using GmemLayoutTagScalefactor = GmemLayoutTagScalefactor_; +}; + + +// D = alpha * acc + beta * C + per-col bias +// With BlockScaleFactor generation. +template< + int SFVecSize_, + class ElementOutput_, + class ElementCompute_, + class ElementBlockScaleFactor_, + class GmemLayoutTagScalefactor_ = cutlass::layout::RowMajor, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinCombPerColBiasBlockScaleFactor + : LinCombPerColBias { + using ElementBlockScaleFactor = ElementBlockScaleFactor_; + static constexpr int SFVecSize = SFVecSize_; + static constexpr bool IsBlockScaleSupported = true; + using GmemLayoutTagScalefactor = GmemLayoutTagScalefactor_; +}; + + +// D = activation(alpha * acc + beta * C + per-row bias) +// With BlockScaleFactor generation. 
+template< + template class ActivationFn_, + int SFVecSize_, + class ElementOutput_, + class ElementCompute_, + class ElementBlockScaleFactor_, + class GmemLayoutTagScalefactor_ = cutlass::layout::RowMajor, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinCombPerRowBiasEltActBlockScaleFactor + : LinCombPerRowBiasEltAct { + using ElementBlockScaleFactor = ElementBlockScaleFactor_; + static constexpr int SFVecSize = SFVecSize_; + static constexpr bool IsBlockScaleSupported = true; + using GmemLayoutTagScalefactor = GmemLayoutTagScalefactor_; +}; + + +// D = activation(alpha * acc + beta * C + per-col bias) +// With BlockScaleFactor generation. +template< + template class ActivationFn_, + int SFVecSize_, + class ElementOutput_, + class ElementCompute_, + class ElementBlockScaleFactor_, + class GmemLayoutTagScalefactor_ = cutlass::layout::RowMajor, + class ElementBias_ = ElementOutput_, + class ElementSource_ = ElementOutput_, + class ElementScalar_ = ElementCompute_, + int AlignmentBias_ = 128 / cute::sizeof_bits_v, + FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest +> +struct LinCombPerColBiasEltActBlockScaleFactor + : LinCombPerColBiasEltAct { + using ElementBlockScaleFactor = ElementBlockScaleFactor_; + static constexpr int SFVecSize = SFVecSize_; + static constexpr bool IsBlockScaleSupported = true; + using GmemLayoutTagScalefactor = GmemLayoutTagScalefactor_; +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::fusion + +///////////////////////////////////////////////////////////////////////////////////////////////// + + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git 
a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm100_callbacks_tma_warpspecialized.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm100_callbacks_tma_warpspecialized.hpp new file mode 100644 index 0000000000000000000000000000000000000000..dfbb75bf00bd2160af770566c4f3970a2c7b5b10 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm100_callbacks_tma_warpspecialized.hpp @@ -0,0 +1,1322 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Fusion callbacks specializations for the sm100 TMA warp-specialized (ws) epilogue +*/ + + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" + +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/fusion/callbacks.hpp" +#include "cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp" + +#include "cutlass/epilogue/fusion/sm100_visitor_compute_tma_warpspecialized.hpp" +#include "cutlass/epilogue/fusion/sm100_visitor_store_tma_warpspecialized.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::fusion { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Sm100 Tma warp specialized callbacks just alias to their sm90 counterpart +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class Operation, + class CtaTile_MNK, + class EpilogueTile_MN, + class... Args +> +struct FusionCallbacks< + epilogue::Sm100TmaWarpSpecialized, + Operation, + CtaTile_MNK, + EpilogueTile_MN, + Args... +> : FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + Operation, + CtaTile_MNK, + EpilogueTile_MN, + Args... 
+ > { + using FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + Operation, + CtaTile_MNK, + EpilogueTile_MN, + Args...>::FusionCallbacks; +}; + +// Sm100 direct store callbacks alias to sm100 tma callbacks with 0 stages +// Additional copy atom args will be ignored in the 0-stage specializations of aux load/store nodes +template < + class Operation, + class CtaTile_MNK, + class EpilogueTile_MN, + class... Args +> +struct FusionCallbacks< + epilogue::Sm100NoSmemWarpSpecialized, + Operation, + CtaTile_MNK, + EpilogueTile_MN, + Args... +> : FusionCallbacks< + epilogue::Sm100TmaWarpSpecialized<0, 0, 0, false, false>, + Operation, + CtaTile_MNK, + EpilogueTile_MN, + Args... + > { + using FusionCallbacks< + epilogue::Sm100TmaWarpSpecialized<0, 0, 0, false, false>, + Operation, + CtaTile_MNK, + EpilogueTile_MN, + Args...>::FusionCallbacks; +}; + +// Sm100 Ptr array tma warp specialized callbacks just alias to their sm90 counterpart +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class Operation, + class CtaTile_MNK, + class EpilogueTile_MN, + class... Args +> +struct FusionCallbacks< + epilogue::Sm100PtrArrayTmaWarpSpecialized, + Operation, + CtaTile_MNK, + EpilogueTile_MN, + Args... +> : FusionCallbacks< + epilogue::Sm90PtrArrayTmaWarpSpecialized, + Operation, + CtaTile_MNK, + EpilogueTile_MN, + Args... + > { + using FusionCallbacks< + epilogue::Sm90PtrArrayTmaWarpSpecialized, + Operation, + CtaTile_MNK, + EpilogueTile_MN, + Args...>::FusionCallbacks; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = alpha * acc + beta * C +// With Row BlockScaleFactor Generation. 
+template< + int SFVecsize, + class EpilogueTile, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm100LinearCombRowBlockScaleFactor = + Sm90EVT, // gen scalefactor + Sm90LinearCombination // beta * C + (alpha * acc) + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementSource, + class ElementScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm100TmaWarpSpecialized, + fusion::LinCombBlockScaleFactor, + CtaTileShapeMNK, + EpilogueTile +> : Sm100LinearCombRowBlockScaleFactor::type, ElementCompute, ElementBlockScaleFactor, ElementSource, ElementScalar, RoundStyle> { + + using Impl = Sm100LinearCombRowBlockScaleFactor::type, ElementCompute, ElementBlockScaleFactor, ElementSource, ElementScalar, RoundStyle>; + using Operation = fusion::LinCombBlockScaleFactor; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementBlockScaleFactor * block_scale_factor_ptr = nullptr; + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. 
+ using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + operator typename Impl::Arguments() const { + return + { + { + // ternary op : beta * C + (alpha * acc) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // binary op : alpha * acc + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // binary args : multiplies + }, // end binary op + {} // ternary args : multiply_add + }, + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +// D = alpha * acc + beta * C +// With Col BlockScaleFactor Generation. +template< + int SFVecsize, + class EpilogueTile, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm100LinearCombColBlockScaleFactor = + Sm90EVT, // gen scalefactor + Sm90LinearCombination // beta * C + (alpha * acc) + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementSource, + class ElementScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm100TmaWarpSpecialized, + fusion::LinCombBlockScaleFactor, + CtaTileShapeMNK, + EpilogueTile +> : Sm100LinearCombColBlockScaleFactor::type, ElementCompute, ElementBlockScaleFactor, ElementSource, ElementScalar, RoundStyle> { + + using Impl = 
Sm100LinearCombColBlockScaleFactor::type, ElementCompute, ElementBlockScaleFactor, ElementSource, ElementScalar, RoundStyle>; + using Operation = fusion::LinCombBlockScaleFactor; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementBlockScaleFactor * block_scale_factor_ptr = nullptr; + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. + using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + operator typename Impl::Arguments() const { + return + { + { + // ternary op : beta * C + (alpha * acc) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // binary op : alpha * acc + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // binary args : multiplies + }, // end binary op + {} // ternary args : multiply_add + }, + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// For Ptr-Array and Grouped GEMM +// D = alpha * acc + beta * C, where alpha and beta can be vectors for each batch/group +// With Row BlockScaleFactor Generation, separate tensors per batch/group. 
+template< + int SFVecsize, + class EpilogueTile, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm100LinearCombRowBlockScaleFactorPtrArray = + Sm90EVT, // gen scalefactor + Sm90LinearCombinationPtrArray // beta * C + (alpha * acc) + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementSource, + class ElementScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm100PtrArrayTmaWarpSpecialized, + fusion::LinCombBlockScaleFactor, + CtaTileShapeMNK, + EpilogueTile +> : Sm100LinearCombRowBlockScaleFactorPtrArray::type, ElementCompute, ElementBlockScaleFactor, ElementSource, ElementScalar, RoundStyle> { + + using Impl = Sm100LinearCombRowBlockScaleFactorPtrArray::type, ElementCompute, ElementBlockScaleFactor, ElementSource, ElementScalar, RoundStyle>; + using Operation = fusion::LinCombBlockScaleFactor; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementScalar const* const* alpha_ptr_array = nullptr; + ElementScalar const* const* beta_ptr_array = nullptr; + ElementBlockScaleFactor ** block_scale_factor_ptr = nullptr; + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. 
+ // NormConst is a single device-side constant value, its not per-batch or per-group + using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + operator typename Impl::Arguments() const { + return + { + { + // ternary op : beta * C + (alpha * acc) + {{beta}, {beta_ptr}, {beta_ptr_array}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // binary op : alpha * acc + {{alpha}, {alpha_ptr}, {alpha_ptr_array}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // binary args : multiplies + }, // end binary op + {} // ternary args : multiply_add + }, + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// For Ptr-Array and Grouped GEMM +// D = activation(alpha * acc + beta * C), where alpha and beta can be vectors for each batch/group +// With Row BlockScaleFactor Generation, separate tensors per batch/group. 
+template< + int SFVecsize, + class EpilogueTile, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm100LinCombEltActRowBlockScaleFactorPtrArray = + Sm90EVT, // gen scalefactor + Sm90LinCombEltActPtrArray // activation(beta * C + (alpha * acc)) + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementSource, + class ElementScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm100PtrArrayTmaWarpSpecialized, + fusion::LinCombEltActBlockScaleFactor, + CtaTileShapeMNK, + EpilogueTile +> : Sm100LinCombEltActRowBlockScaleFactorPtrArray::type, ElementCompute, ElementBlockScaleFactor, ElementSource, ElementScalar, RoundStyle> { + + using Impl = Sm100LinCombEltActRowBlockScaleFactorPtrArray::type, ElementCompute, ElementBlockScaleFactor, ElementSource, ElementScalar, RoundStyle>; + using Operation = fusion::LinCombEltActBlockScaleFactor; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementScalar const* const* alpha_ptr_array = nullptr; + ElementScalar const* const* beta_ptr_array = nullptr; + ElementBlockScaleFactor ** block_scale_factor_ptr = nullptr; + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. 
+ using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { + { // unary op: activation(beta * C + (alpha * acc)) + { // ternary op : beta * C + (alpha * acc) + {{beta}, {beta_ptr}, {beta_ptr_array}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // binary op : alpha * acc + {{alpha}, {alpha_ptr}, {alpha_ptr_array}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // binary args : multiplies + }, // end binary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args : activation + }, // end unary op + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = alpha * acc + beta * C + per-row bias +// with row blockScaled generation +template< + int SFVecsize, + class CtaTileShapeMNK, + class EpilogueTile, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm100LinCombPerRowBiasRowBlockScaleFactor = + Sm90EVT< + Sm100BlockScaleFactorRowStore< + SFVecsize, EpilogueTile, ElementOutput, + ElementCompute, ElementBlockScaleFactor, RoundStyle + >, + Sm90LinCombPerRowBias< + CtaTileShapeMNK, ElementCompute, 
ElementCompute, + ElementBias, ElementSource, ElementScalar, + AlignmentBias, RoundStyle + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm100TmaWarpSpecialized, + fusion::LinCombPerRowBiasBlockScaleFactor< + SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm100LinCombPerRowBiasRowBlockScaleFactor< + SFVecSize, CtaTileShapeMNK, EpilogueTile, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias, + ElementSource, + ElementScalar, + AlignmentBias, + RoundStyle + > +{ + + using Impl = + Sm100LinCombPerRowBiasRowBlockScaleFactor< + SFVecSize, CtaTileShapeMNK, EpilogueTile, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias, + ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + + using Operation = + fusion::LinCombPerRowBiasBlockScaleFactor< + SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementBlockScaleFactor * block_scale_factor_ptr = nullptr; + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. 
+ using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_1,_0,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + operator typename Impl::Arguments() const { + return + { + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// D = alpha * acc + beta * C + per-row bias +// with col blockScaled generation +template< + int SFVecsize, + class CtaTileShapeMNK, + class EpilogueTile, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm100LinCombPerRowBiasColBlockScaleFactor = + Sm90EVT< + Sm100BlockScaleFactorColStore< + SFVecsize, EpilogueTile, ElementOutput, + ElementCompute, ElementBlockScaleFactor, RoundStyle + >, + Sm90LinCombPerRowBias< + CtaTileShapeMNK, ElementCompute, ElementCompute, + ElementBias, ElementSource, ElementScalar, + AlignmentBias, RoundStyle + > + >; + 
+template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm100TmaWarpSpecialized, + fusion::LinCombPerRowBiasBlockScaleFactor< + SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::ColumnMajor, + ElementBias, + ElementSource, ElementScalar, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm100LinCombPerRowBiasColBlockScaleFactor< + SFVecSize, CtaTileShapeMNK, EpilogueTile, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias, + ElementSource, ElementScalar, AlignmentBias, RoundStyle + > +{ + + using Impl = + Sm100LinCombPerRowBiasColBlockScaleFactor< + SFVecSize, CtaTileShapeMNK, EpilogueTile, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias, + ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + + using Operation = + fusion::LinCombPerRowBiasBlockScaleFactor< + SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::ColumnMajor, + ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementBlockScaleFactor * block_scale_factor_ptr = nullptr; + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. 
+ using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_1,_0,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + operator typename Impl::Arguments() const { + return + { + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = alpha * acc + beta * C + per_col bias +// with row blockScaled generation +template< + int StagesC, + int SFVecsize, + class CtaTileShapeMNK, + class EpilogueTile, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm100LinCombPerColBiasRowBlockScaleFactor = + Sm90EVT< + Sm100BlockScaleFactorRowStore< + SFVecsize, EpilogueTile, ElementOutput, + ElementCompute, ElementBlockScaleFactor, RoundStyle + >, + Sm90LinCombPerColBias< + StagesC, CtaTileShapeMNK, EpilogueTile, ElementCompute, ElementCompute, + ElementBias, ElementSource, 
ElementScalar, + AlignmentBias, RoundStyle + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm100TmaWarpSpecialized, + fusion::LinCombPerColBiasBlockScaleFactor< + SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementBias, ElementSource, + ElementScalar, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm100LinCombPerColBiasRowBlockScaleFactor< + StagesC, SFVecSize, CtaTileShapeMNK, EpilogueTile, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias, + ElementSource, ElementScalar, AlignmentBias, RoundStyle + > +{ + + using Impl = + Sm100LinCombPerColBiasRowBlockScaleFactor< + StagesC, SFVecSize, CtaTileShapeMNK, EpilogueTile, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias, + ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + + using Operation = + fusion::LinCombPerColBiasBlockScaleFactor< + SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementBias, ElementSource, + ElementScalar, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementBlockScaleFactor * block_scale_factor_ptr = nullptr; + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. 
+ using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + + using StrideBias = Stride<_0,_1,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + operator typename Impl::Arguments() const { + return + { + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = activation(alpha * acc + beta * C + per-row bias) +// with row blockScaled generation +template< + int SFVecsize, + class CtaTileShapeMNK, + class EpilogueTile, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm100LinCombPerRowBiasEltActRowBlockScaleFactor = + Sm90EVT< + Sm100BlockScaleFactorRowStore< + SFVecsize, EpilogueTile, + ElementOutput, ElementCompute, + ElementBlockScaleFactor, RoundStyle + >, + Sm90LinCombPerRowBiasEltAct< + CtaTileShapeMNK, ActivationFn, + ElementCompute, ElementCompute, 
ElementBias, + ElementSource, ElementScalar, AlignmentBias, RoundStyle + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm100TmaWarpSpecialized, + fusion::LinCombPerRowBiasEltActBlockScaleFactor< + ActivationFn, SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm100LinCombPerRowBiasEltActRowBlockScaleFactor< + SFVecSize, CtaTileShapeMNK, EpilogueTile, ActivationFn, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias, ElementSource, ElementScalar, + AlignmentBias, RoundStyle + > { + + using Impl = + Sm100LinCombPerRowBiasEltActRowBlockScaleFactor< + SFVecSize, CtaTileShapeMNK, EpilogueTile, ActivationFn, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias, ElementSource, ElementScalar, + AlignmentBias, RoundStyle + >; + + using Operation = + fusion::LinCombPerRowBiasEltActBlockScaleFactor< + ActivationFn, SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementBlockScaleFactor * block_scale_factor_ptr = nullptr; + // A matrix wide constant value to scale the output matrix + // 
Avoids generating small FP4 values. + using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_1,_0,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { + { // unary op : activation(beta * C + (alpha * acc + bias)) + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args : activation + }, // end unary op + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = activation(alpha * acc + beta * C + per-row bias) +// with col blockScaled generation +template< + int SFVecsize, + class CtaTileShapeMNK, + class EpilogueTile, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using 
Sm100LinCombPerRowBiasEltActColBlockScaleFactor = + Sm90EVT< + Sm100BlockScaleFactorColStore< + SFVecsize, EpilogueTile, + ElementOutput, ElementCompute, + ElementBlockScaleFactor, RoundStyle + >, + Sm90LinCombPerRowBiasEltAct< + CtaTileShapeMNK, ActivationFn, + ElementCompute, ElementCompute, ElementBias, + ElementSource, ElementScalar, AlignmentBias, RoundStyle + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm100TmaWarpSpecialized, + fusion::LinCombPerRowBiasEltActBlockScaleFactor< + ActivationFn, SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::ColumnMajor, + ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm100LinCombPerRowBiasEltActColBlockScaleFactor< + SFVecSize, CtaTileShapeMNK, EpilogueTile, ActivationFn, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias, ElementSource, ElementScalar, + AlignmentBias, RoundStyle + > { + + using Impl = + Sm100LinCombPerRowBiasEltActColBlockScaleFactor< + SFVecSize, CtaTileShapeMNK, EpilogueTile, ActivationFn, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias, ElementSource, ElementScalar, + AlignmentBias, RoundStyle + >; + + using Operation = + fusion::LinCombPerRowBiasEltActBlockScaleFactor< + ActivationFn, SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::ColumnMajor, + ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + + struct Arguments { 
+ ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementBlockScaleFactor * block_scale_factor_ptr = nullptr; + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. + using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + + using StrideBias = Stride<_1,_0,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { + { // unary op : activation(beta * C + (alpha * acc + bias)) + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args : activation + }, // end unary op + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = activation(alpha * acc + beta * C + per_col bias) +// with row blockScaled generation +template< + int StagesC, + int SFVecsize, + class CtaTileShapeMNK, + class EpilogueTile, + template class ActivationFn, + class 
ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm100LinCombPerColBiasEltActRowBlockScaleFactor = + Sm90EVT< + Sm100BlockScaleFactorRowStore< + SFVecsize, EpilogueTile, + ElementOutput, ElementCompute, + ElementBlockScaleFactor, RoundStyle + >, + Sm90LinCombPerColBiasEltAct< + StagesC, CtaTileShapeMNK, EpilogueTile, ActivationFn, + ElementCompute, ElementCompute, ElementBias, + ElementSource, ElementScalar, AlignmentBias, RoundStyle + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm100TmaWarpSpecialized, + fusion::LinCombPerColBiasEltActBlockScaleFactor< + ActivationFn, SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementBias, ElementSource, + ElementScalar, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm100LinCombPerColBiasEltActRowBlockScaleFactor< + StagesC, SFVecSize, CtaTileShapeMNK, EpilogueTile, ActivationFn, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias, ElementSource, ElementScalar, + AlignmentBias, RoundStyle + > { + + using Impl = + Sm100LinCombPerColBiasEltActRowBlockScaleFactor< + StagesC, SFVecSize, CtaTileShapeMNK, EpilogueTile, ActivationFn, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias, 
ElementSource, ElementScalar, + AlignmentBias, RoundStyle + >; + + using Operation = + fusion::LinCombPerColBiasEltActBlockScaleFactor< + ActivationFn, SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementBias, ElementSource, + ElementScalar, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementBlockScaleFactor * block_scale_factor_ptr = nullptr; + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. + using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_0,_1,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { + { // unary op : activation(beta * C + (alpha * acc + bias)) + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args : activation + }, // end unary op + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using 
Impl::Impl; +}; + + +// -------------------------------------------------------------------- +// Sm100PtrArrayNoSmemWarpSpecialized (direct-store, grouped GEMM) +// -------------------------------------------------------------------- +template < + class Operation, + class CtaTile_MNK, + class EpilogueTile_MN, + class... Args +> +struct FusionCallbacks< + epilogue::Sm100PtrArrayNoSmemWarpSpecialized, + Operation, + CtaTile_MNK, + EpilogueTile_MN, + Args...> + : FusionCallbacks< + // reuse the ptr-array *TMA* callbacks with 0 stages + epilogue::Sm100PtrArrayTmaWarpSpecialized<0,0,0,false,false>, + Operation, + CtaTile_MNK, + EpilogueTile_MN, + Args...> { + + using Base = FusionCallbacks< + epilogue::Sm100PtrArrayTmaWarpSpecialized<0,0,0,false,false>, + Operation, + CtaTile_MNK, + EpilogueTile_MN, + Args...>; + + // bring ctors into scope + using Base::Base; +}; + +} // namespace cutlass::epilogue::fusion + +///////////////////////////////////////////////////////////////////////////////////////////////// + + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm100_visitor_compute_tma_warpspecialized.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm100_visitor_compute_tma_warpspecialized.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a20591288ad386543c3c7f0fd399c7fe45b7f60a --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm100_visitor_compute_tma_warpspecialized.hpp @@ -0,0 +1,500 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief Visitor tree compute operations for the sm100 TMA warp-specialized (ws) epilogue +*/ + + + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/detail/sm100_blockscaled_layout.hpp" +#include "cutlass/epilogue/thread/activation.h" +#include "cute/tensor.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp" +#include "cutlass/epilogue/fusion/sm100_visitor_store_tma_warpspecialized.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::fusion { + +using namespace cute; +using namespace detail; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// BatchNormApply +// +// This node aims to do the batch norm apply. The procedure is described as follows: +// +// output = (input - mean) * inv_stddev * alpha + bias +// +// while: (1) input & output are 2 matrices with shape (M, N), +// which are frg_input & return value of the visit function +// +// (2) mean, inv_stddev, alpha & bias are 4 vectors with shape (N). 
+// which are loaded by ProducerLoadCallbacks +// +// To avoid redundant calculations in EVT, this node simplify the procedure as follows: +// +// output = input * alpha' + bias' +// +// while alpha' & bias' are 2 vectors with shape (N) calculated by mean, inv_stddev, alpha & bias +// +// The calculation among vectors is described as follows: +// +// alpha' = alpha * inv_stddev +// bias' = bias - mean * alpha' +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + // reuses the mbarriers from the epilogue subtile load pipeline, so this must be at least + // this should just match CLC stage count + int Stages, + class CtaTileShapeMNK, + class ElementScalar, + class ElementCompute, + class ElementOutput, + class StrideMNL = Stride<_0,_1,_0>, + int Alignment = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +struct Sm100BatchNormApply { + static_assert(Alignment * sizeof_bits_v % 128 == 0, "sub-16B alignment not supported yet"); + static_assert(cute::is_same_v>); // row vector broadcast for alpha, bias, mean & inv_stddev + + using SmemLayout = decltype(make_layout(make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{}), Stages), + make_stride(_0{},_1{},size<1>(CtaTileShapeMNK{})))); + + using ElementCol = cute::conditional_t<(sizeof(ElementCompute) > sizeof(ElementScalar)), ElementCompute, ElementScalar>; + + struct SharedStorage { + alignas(16) array_aligned(CtaTileShapeMNK{}) * Stages> smem_alpha; + alignas(16) array_aligned(CtaTileShapeMNK{}) * Stages> smem_bias; + alignas(16) array_aligned(CtaTileShapeMNK{}) * Stages> smem_mean; + alignas(16) array_aligned(CtaTileShapeMNK{}) * Stages> smem_inv_stddev; + }; + + struct Arguments { + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* bias_ptr = nullptr; + ElementScalar const* mean_ptr = nullptr; + ElementScalar const* inv_stddev_ptr = nullptr; + StrideMNL dVec = {}; + }; + + struct Params { 
+ using TMA_Vec = decltype(make_tma_atom( + SM90_TMA_LOAD{}, + make_tensor(make_gmem_ptr(nullptr), repeat_like(StrideMNL{}, int32_t(0)), append<3>(StrideMNL{}, _0{})), + take<0,2>(SmemLayout{}), + take<0,2>(CtaTileShapeMNK{}))); + + TMA_Vec tma_load_alpha; + TMA_Vec tma_load_bias; + TMA_Vec tma_load_mean; + TMA_Vec tma_load_inv_stddev; + }; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + // Optionally append 1s until problem shape is rank-4 in case its is only rank-3 (MNK) + auto problem_shape_mnkl = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_mnkl; + + Tensor tensor_alpha = make_tensor(make_gmem_ptr(args.alpha_ptr), make_layout(make_shape(size(M),N,size(L)), append<3>(args.dVec, _0{}))); + Tensor tensor_bias = make_tensor(make_gmem_ptr(args.bias_ptr), make_layout(make_shape(size(M),N,size(L)), append<3>(args.dVec, _0{}))); + Tensor tensor_mean = make_tensor(make_gmem_ptr(args.mean_ptr), make_layout(make_shape(size(M),N,size(L)), append<3>(args.dVec, _0{}))); + Tensor tensor_inv_stddev = make_tensor(make_gmem_ptr(args.inv_stddev_ptr), make_layout(make_shape(size(M),N,size(L)), append<3>(args.dVec, _0{}))); + + typename Params::TMA_Vec tma_load_alpha = make_tma_atom(SM90_TMA_LOAD{}, tensor_alpha, take<0,2>(SmemLayout{}), take<0,2>(CtaTileShapeMNK{})); + typename Params::TMA_Vec tma_load_bias = make_tma_atom(SM90_TMA_LOAD{}, tensor_bias, take<0,2>(SmemLayout{}), take<0,2>(CtaTileShapeMNK{})); + typename Params::TMA_Vec tma_load_mean = make_tma_atom(SM90_TMA_LOAD{}, tensor_mean, take<0,2>(SmemLayout{}), take<0,2>(CtaTileShapeMNK{})); + typename Params::TMA_Vec tma_load_inv_stddev = make_tma_atom(SM90_TMA_LOAD{}, tensor_inv_stddev, take<0,2>(SmemLayout{}), take<0,2>(CtaTileShapeMNK{})); + + return Params{tma_load_alpha, tma_load_bias, tma_load_mean, tma_load_inv_stddev}; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, 
Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm100BatchNormApply() { } + + CUTLASS_HOST_DEVICE + Sm100BatchNormApply(Params const& params, SharedStorage const& shared_storage) + : params_ptr(¶ms), + smem_alpha(const_cast(shared_storage.smem_alpha.data())), + smem_bias(const_cast(shared_storage.smem_bias.data())), + smem_mean(const_cast(shared_storage.smem_mean.data())), + smem_inv_stddev(const_cast(shared_storage.smem_inv_stddev.data())), + smem_col_alpha(const_cast(shared_storage.smem_alpha.data())), + smem_col_bias(const_cast(shared_storage.smem_bias.data())) { } + + Params const* params_ptr; + ElementScalar* smem_alpha; + ElementScalar* smem_bias; + ElementScalar* smem_mean; + ElementScalar* smem_inv_stddev; + ElementCompute* smem_col_alpha; + ElementCompute* smem_col_bias; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return true; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + template + struct ProducerLoadCallbacks : EmptyProducerLoadCallbacks { + CUTLASS_DEVICE + ProducerLoadCallbacks(GTensor&& gAlpha, GTensor&& gBias, GTensor&& gMean, GTensor&& gInvStddev, + STensor&& sAlpha, STensor&& sBias, STensor&& sMean, STensor&& sInvStddev, Params const* params_ptr) + : gAlpha(cute::forward(gAlpha)), + gBias(cute::forward(gBias)), + gMean(cute::forward(gMean)), + gInvStddev(cute::forward(gInvStddev)), + sAlpha(cute::forward(sAlpha)), + sBias(cute::forward(sBias)), + sMean(cute::forward(sMean)), + sInvStddev(cute::forward(sInvStddev)), + params_ptr(params_ptr) {} + + GTensor gAlpha; + GTensor gBias; + GTensor gMean; + GTensor 
gInvStddev; + + STensor sAlpha; + STensor sBias; + STensor sMean; + STensor sInvStddev; + + Params const* params_ptr; + + CUTLASS_DEVICE void + step(uint64_t* full_mbarrier_ptr, int epi_m, int epi_n, int load_iteration, bool issue_tma_load) { + if (epi_m == 0 && epi_n == 0 && issue_tma_load) { + // Increment the expect-tx count of the first subtile's mbarrier by the row vector's byte-size + constexpr uint32_t copy_bytes = size<1>(CtaTileShapeMNK{}) * bits_to_bytes(sizeof_bits_v) * 4; + cutlass::arch::ClusterTransactionBarrier::expect_transaction(full_mbarrier_ptr, copy_bytes); + // Issue the TMA bulk copy + int pipe_index = (load_iteration / EpiTiles) % Stages; + copy(params_ptr->tma_load_alpha.with(*full_mbarrier_ptr), gAlpha, sAlpha(_,pipe_index)); + copy(params_ptr->tma_load_bias.with(*full_mbarrier_ptr), gBias, sBias(_,pipe_index)); + copy(params_ptr->tma_load_mean.with(*full_mbarrier_ptr), gMean, sMean(_,pipe_index)); + copy(params_ptr->tma_load_inv_stddev.with(*full_mbarrier_ptr), gInvStddev, sInvStddev(_,pipe_index)); + } + } + }; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + + Tensor mAlpha = params_ptr->tma_load_alpha.get_tma_tensor(make_shape(size(M),N,size(L))); + Tensor mBias = params_ptr->tma_load_bias.get_tma_tensor(make_shape(size(M),N,size(L))); + Tensor mMean = params_ptr->tma_load_mean.get_tma_tensor(make_shape(size(M),N,size(L))); + Tensor mInvStddev = params_ptr->tma_load_inv_stddev.get_tma_tensor(make_shape(size(M),N,size(L))); + + Tensor gAlpha = local_tile(mAlpha, take<0,2>(args.tile_shape_mnk), make_coord(m,n,l)); // (CTA_M,CTA_N) + Tensor gBias = local_tile(mBias, take<0,2>(args.tile_shape_mnk), make_coord(m,n,l)); // (CTA_M,CTA_N) + Tensor gMean = local_tile(mMean, take<0,2>(args.tile_shape_mnk), make_coord(m,n,l)); // (CTA_M,CTA_N) + Tensor gInvStddev = local_tile(mInvStddev, 
take<0,2>(args.tile_shape_mnk), make_coord(m,n,l)); // (CTA_M,CTA_N) + + Tensor sAlpha = make_tensor(make_smem_ptr(smem_alpha), SmemLayout{}); // (CTA_M,CTA_N,PIPE) + Tensor sBias = make_tensor(make_smem_ptr(smem_bias), SmemLayout{}); // (CTA_M,CTA_N,PIPE) + Tensor sMean = make_tensor(make_smem_ptr(smem_mean), SmemLayout{}); // (CTA_M,CTA_N,PIPE) + Tensor sInvStddev = make_tensor(make_smem_ptr(smem_inv_stddev), SmemLayout{}); // (CTA_M,CTA_N,PIPE) + + auto [tCgAlpha, tCsAlpha] = tma_partition(params_ptr->tma_load_alpha, group_modes<0,2>(sAlpha), group_modes<0,2>(gAlpha)); + auto [tCgBias, tCsBias] = tma_partition(params_ptr->tma_load_bias, group_modes<0,2>(sBias), group_modes<0,2>(gBias)); + auto [tCgMean, tCsMean] = tma_partition(params_ptr->tma_load_mean, group_modes<0,2>(sMean), group_modes<0,2>(gMean)); + auto [tCgInvStddev, tCsInvStddev] = tma_partition(params_ptr->tma_load_inv_stddev, group_modes<0,2>(sInvStddev), group_modes<0,2>(gInvStddev)); + + constexpr int EpiTiles = decltype(size(ceil_div(shape(take<0,2>(args.tile_shape_mnk)), args.epi_tile)))::value; + return ProducerLoadCallbacks( + cute::move(tCgAlpha), cute::move(tCgBias), cute::move(tCgMean), cute::move(tCgInvStddev), + cute::move(tCsAlpha), cute::move(tCsBias), cute::move(tCsMean), cute::move(tCsInvStddev), params_ptr); + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + SR_RTensor&& tSR_rAlpha, SR_RTensor&& tSR_rBias, + SR_RTensor&& tSR_rMean, SR_RTensor&& tSR_rInvStddev, + SR_STensor&& tSR_sAlpha, SR_STensor&& tSR_sBias, + SR_STensor&& tSR_sMean, SR_STensor&& tSR_sInvStddev, + SR_CTensor&& tSR_cAlpha, + SR_SCTensor&& tSR_sColAlpha, SR_SCTensor&& tSR_sColBias, + RTensor&& tCrAlpha, RTensor&& tCrBias, + STensor&& tCsAlpha, STensor&& tCsBias, + ThrNum thr_num, + Params const* params_ptr) + : + tSR_rAlpha(cute::forward(tSR_rAlpha)), tSR_rBias(cute::forward(tSR_rBias)), + tSR_rMean(cute::forward(tSR_rMean)), 
tSR_rInvStddev(cute::forward(tSR_rInvStddev)), + tSR_sAlpha(cute::forward(tSR_sAlpha)), tSR_sBias(cute::forward(tSR_sBias)), + tSR_sMean(cute::forward(tSR_sMean)), tSR_sInvStddev(cute::forward(tSR_sInvStddev)), + tSR_cAlpha(cute::forward(tSR_cAlpha)), + tSR_sColAlpha(cute::forward(tSR_sColAlpha)), tSR_sColBias(cute::forward(tSR_sColBias)), + tCrAlpha(cute::forward(tCrAlpha)), tCrBias(cute::forward(tCrBias)), + tCsAlpha(cute::forward(tCsAlpha)), tCsBias(cute::forward(tCsBias)), + thr_num(thr_num), + params_ptr(params_ptr) {} + + SR_RTensor tSR_rAlpha; + SR_RTensor tSR_rBias; + SR_RTensor tSR_rMean; + SR_RTensor tSR_rInvStddev; + SR_STensor tSR_sAlpha; + SR_STensor tSR_sBias; + SR_STensor tSR_sMean; + SR_STensor tSR_sInvStddev; + SR_CTensor tSR_cAlpha; + SR_SCTensor tSR_sColAlpha; + SR_SCTensor tSR_sColBias; + + ThrNum thr_num; + + RTensor tCrAlpha; // (CPY,CPY_M,CPY_N) + RTensor tCrBias; // (CPY,CPY_M,CPY_N) + + STensor tCsAlpha; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,PIPE) + STensor tCsBias; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,PIPE) + + Params const* params_ptr; + + CUTLASS_DEVICE void + previsit(int epi_m, int epi_n, int load_iteration, bool is_producer_load_needed) { + if (epi_m == 0 && epi_n == 0) { // Assumes M-major subtile loop + // Filter so we don't issue redundant copies over stride-0 modes + // (only works if 0-strides are in same location, which is by construction) + auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(thr_num, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; + int pipe_index = (load_iteration / EpiTiles) % Stages; + + Tensor tSR_rAlpha_flt = filter_zeros(tSR_rAlpha); + Tensor tSR_rBias_flt = filter_zeros(tSR_rBias); + Tensor tSR_rMean_flt = filter_zeros(tSR_rMean); + Tensor tSR_rInvStddev_flt = filter_zeros(tSR_rInvStddev); + Tensor tSR_sAlpha_flt = filter_zeros(tSR_sAlpha(_,_,_,pipe_index)); + Tensor tSR_sBias_flt = filter_zeros(tSR_sBias(_,_,_,pipe_index)); + Tensor tSR_sMean_flt = filter_zeros(tSR_sMean(_,_,_,pipe_index)); + 
Tensor tSR_sInvStddev_flt = filter_zeros(tSR_sInvStddev(_,_,_,pipe_index)); + Tensor tSR_cAlpha_flt = filter_zeros(tSR_cAlpha, tSR_rAlpha.stride()); + + for (int i = 0; i < size(tSR_rAlpha_flt); ++i) { + if (get<1>(tSR_cAlpha_flt(i)) >= size<1>(CtaTileShapeMNK{})) { + // OOB of SMEM + continue; + } + tSR_rAlpha_flt(i) = tSR_sAlpha_flt(i); + tSR_rBias_flt(i) = tSR_sBias_flt(i); + tSR_rMean_flt(i) = tSR_sMean_flt(i); + tSR_rInvStddev_flt(i) = tSR_sInvStddev_flt(i); + } + + constexpr int RegFragSize = cute::min(size(tSR_rAlpha_flt), cute::max(1, static_cast(sizeof(uint32_t) / sizeof(ElementCompute)))); + Tensor tSR_rAlpha_frg = recast>(tSR_rAlpha_flt); // (FRG_V) + Tensor tSR_rBias_frg = recast>(tSR_rBias_flt); // (FRG_V) + Tensor tSR_rMean_frg = recast>(tSR_rMean_flt); // (FRG_V) + Tensor tSR_rInvStddev_frg = recast>(tSR_rInvStddev_flt); // (FRG_V) + + cutlass::multiplies> mul; + cutlass::negate> negate; + cutlass::multiply_add> mul_add; + + // We do computation among vectors before computation among matrices + // alpha' = alpha * inv_stddev + // bias' = bias - alpha' * mean + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tSR_rAlpha_frg); ++i) { + tSR_rAlpha_frg(i) = mul(tSR_rAlpha_frg(i), tSR_rInvStddev_frg(i)); + tSR_rBias_frg(i) = mul_add(tSR_rAlpha_frg(i), negate(tSR_rMean_frg(i)), tSR_rBias_frg(i)); + } + + Tensor tSR_sColAlpha_flt = filter_zeros(tSR_sColAlpha(_,_,_,pipe_index)); + Tensor tSR_sColBias_flt = filter_zeros(tSR_sColBias(_,_,_,pipe_index)); + // After computation, 4 vectors -> 2 vectors + for (int i = 0; i < size(tSR_rAlpha_flt); ++i) { + if (get<1>(tSR_cAlpha_flt(i)) >= size<1>(CtaTileShapeMNK{})) { + // OOB of SMEM + continue; + } + tSR_sColAlpha_flt(i) = tSR_rAlpha_flt(i); + tSR_sColBias_flt(i) = tSR_rBias_flt(i); + } + + synchronize(); + + // To do bn_apply with Acc, reload these 2 vectors with the consistent shape + copy_aligned(tCsAlpha(_,_,_,_,_,pipe_index), tCrAlpha); + copy_aligned(tCsBias(_,_,_,_,_,pipe_index), tCrBias); + } + } + + 
template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n, + Array const& frg_inputs) { + constexpr int RegFragSize = cute::max(1, static_cast(sizeof(uint32_t) / sizeof(ElementCompute))); + cutlass::multiply_add> mul_add; + + Array frg_apply; + + using ConvertInput = NumericArrayConverter; + using ConvertOutput = NumericArrayConverter; + + ConvertInput convert_input{}; + ConvertOutput convert_output{}; + + Array frg_I = convert_input(frg_inputs); + + Tensor tCrAlpha_frg = recast>(tCrAlpha(_,_,_,epi_m,epi_n)); + Tensor tCrBias_frg = recast>(tCrBias(_,_,_,epi_m,epi_n)); + + constexpr int RegFragArraySize = FragmentSize / RegFragSize; + using RegFragArr = Array, RegFragArraySize>; + RegFragArr& frg_I_ = reinterpret_cast(frg_I); + RegFragArr& frg_apply_ = reinterpret_cast(frg_apply); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < RegFragArraySize; ++i) { + frg_apply_[i] = mul_add(tCrAlpha_frg(epi_v * RegFragArraySize + i), frg_I_[i], tCrBias_frg(epi_v * RegFragArraySize + i)); + } + + return convert_output(frg_apply); + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + using ThreadCount = decltype(size(args.tiled_copy)); + + Tensor sAlpha = make_tensor(make_smem_ptr(smem_alpha), // (CTA_M,CTA_N,PIPE) + make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{}), Stages), + make_stride(_0{},_1{},size<1>(CtaTileShapeMNK{}))); + Tensor sBias = make_tensor(make_smem_ptr(smem_bias), // (CTA_M,CTA_N,PIPE) + make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{}), Stages), + make_stride(_0{},_1{},size<1>(CtaTileShapeMNK{}))); + Tensor sColAlpha = make_tensor(make_smem_ptr(smem_col_alpha), // (CTA_M,CTA_N,PIPE) + make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{}), Stages), + make_stride(_0{},_1{},size<1>(CtaTileShapeMNK{}))); + Tensor sColBias = make_tensor(make_smem_ptr(smem_col_bias), // (CTA_M,CTA_N,PIPE) + make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{}), Stages), + make_stride(_0{},_1{},size<1>(CtaTileShapeMNK{}))); + Tensor sMean = make_tensor(make_smem_ptr(smem_mean), // (CTA_M,CTA_N,PIPE) + make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{}), Stages), + make_stride(_0{},_1{},size<1>(CtaTileShapeMNK{}))); + Tensor sInvStddev = make_tensor(make_smem_ptr(smem_inv_stddev), // (CTA_M,CTA_N,PIPE) + make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{}), Stages), + make_stride(_0{},_1{},size<1>(CtaTileShapeMNK{}))); + + // S2R: Smem to Reg + auto tiled_s2r = make_tiled_copy(Copy_Atom{}, + Layout< Shape<_1, ThreadCount>, + Stride<_0, _1>>{}, + Layout<_1>{}); + auto thr_s2r = tiled_s2r.get_slice(args.thread_idx); + Tensor tSR_sAlpha = thr_s2r.partition_S(sAlpha); + Tensor tSR_sBias = thr_s2r.partition_S(sBias); + Tensor tSR_sMean = thr_s2r.partition_S(sMean); + Tensor tSR_sInvStddev = thr_s2r.partition_S(sInvStddev); + Tensor tSR_sColAlpha = thr_s2r.partition_S(sColAlpha); + Tensor tSR_sColBias = thr_s2r.partition_S(sColBias); + Tensor tSR_cAlpha = 
thr_s2r.partition_S(args.cD); + + Tensor tSR_rAlpha = make_tensor_like(take<0,3>(tSR_sAlpha)); // need to check + Tensor tSR_rBias = make_tensor_like(take<0,3>(tSR_sBias)); + Tensor tSR_rMean = make_tensor_like(take<0,3>(tSR_sMean)); + Tensor tSR_rInvStddev = make_tensor_like(take<0,3>(tSR_sInvStddev)); + + Tensor tCsAlpha = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,PIPE) + sColAlpha, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tCsBias = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,PIPE) + sColBias, args.epi_tile, args.tiled_copy, args.thread_idx); + + Tensor tCrAlpha = make_tensor_like(take<0,5>(tCsAlpha)); // (CPY,CPY_M,CPY_N) + Tensor tCrBias = make_tensor_like(take<0,5>(tCsBias)); // (CPY,CPY_M,CPY_N) + + constexpr int EpiTiles = decltype(size<1>(zipped_divide(make_layout(take<0,2>(args.tile_shape_mnk)), args.epi_tile)))::value; + return ConsumerStoreCallbacks( + cute::move(tSR_rAlpha), cute::move(tSR_rBias), + cute::move(tSR_rMean), cute::move(tSR_rInvStddev), + cute::move(tSR_sAlpha), cute::move(tSR_sBias), + cute::move(tSR_sMean), cute::move(tSR_sInvStddev), + cute::move(tSR_cAlpha), + cute::move(tSR_sColAlpha), cute::move(tSR_sColBias), + cute::move(tCrAlpha), cute::move(tCrBias), + cute::move(tCsAlpha), cute::move(tCsBias), + ThreadCount{}, + params_ptr); + } +}; + +} // namespace cutlass::epilogue::fusion + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm100_visitor_store_tma_warpspecialized.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm100_visitor_store_tma_warpspecialized.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d026b15ccacef0bb199b7a98172c722f9402d075 --- /dev/null +++ 
b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm100_visitor_store_tma_warpspecialized.hpp @@ -0,0 +1,666 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +/*! \file + \brief Visitor tree store operations for the sm100 TMA warp-specialized (ws) epilogue +*/ + + + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/detail/sm100_blockscaled_layout.hpp" +#include "cute/tensor.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp" +#include "cutlass/detail/helper_macros.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::fusion { + +using namespace cute; +using namespace detail; + +namespace detail { + template + CUTLASS_DEVICE auto + compute_quantized_with_row_scalefactor( + Array& frg_compute, + Array& frg_sf, + ElementCompute norm_constant) + { + cutlass::multiplies mul; + cutlass::multiplies> mul_array; + + Array frg_output; + auto output_frgs = reinterpret_cast *>(frg_output.data()); + auto compute_frgs = reinterpret_cast *>(frg_compute.data()); + + Array qpvscale_rcps = [&]() CUTLASS_LAMBDA_FUNC_INLINE { + if constexpr (cute::is_same_v) { + // UE8M0: Use integer subtraction to do the fast rcp in ue8m0 and then convert to float. + auto e8m0_qpvscale_rcp = cutlass::reciprocal_approximate>{}(frg_sf); + return cutlass::NumericArrayConverter{}(e8m0_qpvscale_rcp); + } + else { + // UE4M3: Do the rcp in fp32 data type. + auto qpvscale_ups = cutlass::NumericArrayConverter{}(frg_sf); + return cutlass::reciprocal_approximate_ftz{}(qpvscale_ups); + } + }(); + + // norm_constant and qpvscale_rcps are all positive numbers. 
+ auto acc_scales = cutlass::multiplies>{}(norm_constant, qpvscale_rcps); + + CUTLASS_PRAGMA_UNROLL + for (int sf_v = 0; sf_v < NumVecs; ++sf_v) { + // Map INF to fp32::max + auto acc_scale = minimum_with_nan_propagation{}(acc_scales[sf_v], cutlass::platform::numeric_limits::max()); + // Convert to output type + output_frgs[sf_v] = cutlass::NumericArrayConverter{}(mul_array(compute_frgs[sf_v], acc_scale)); + } + return frg_output; + } +} +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// BlockScaleFactor Generation Operations +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + int SFVecSize, + class EpilogueTile, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +struct Sm100BlockScaleFactorRowStore { + static_assert(size<1>(EpilogueTile{}) % SFVecSize == 0, "EpilogueTileN should be divisible by SFVecSize"); + static_assert(size<1>(EpilogueTile{}) / SFVecSize == 1 or + size<1>(EpilogueTile{}) / SFVecSize == 2 or + size<1>(EpilogueTile{}) / SFVecSize == 4 or + size<1>(EpilogueTile{}) / SFVecSize == 8, + "Possible store in interleaved 4B aligned format"); + using NormalConstStrideMNL = Stride<_0,_0,int64_t>; + struct SharedStorage { }; + + struct Arguments { + ElementBlockScaleFactor* ptr_scale_factor = nullptr; + ElementCompute const* norm_constant_ptr = nullptr; + NormalConstStrideMNL norm_constant_stride = {}; + }; + + using Params = Arguments; + + using UnderlyingElementBlockScaleFactor = cute::remove_pointer_t; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, 
Arguments const& args) { + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M,N,K,L] = problem_shape_MNKL; + bool implementable = (N % SFVecSize == 0); + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: [EVT Sm100BlockScaleFactorRowStore] N-dim should be divisible by SFVecSize.\n"); + } + return implementable; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm100BlockScaleFactorRowStore() { } + + CUTLASS_HOST_DEVICE + Sm100BlockScaleFactorRowStore(Params const& params, SharedStorage const& shared_storage) + : params_ptr(¶ms) { } + + Params const* params_ptr = nullptr; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template < + class RTensor, + class GTensor, + class CoordGTensor, + class ThrResidue, + class EpiTileCoordMN, + class ElementType + > + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + RTensor&& tC_rSFD_, // (CPY,CPY_M,CPY_N) + GTensor&& tC_gSFD_, // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,#EPI_Ms, #EPI_Ns) + CoordGTensor tC_cSFD_, // (m,n) + ThrResidue residue_tC_cSFD_, // (m,n) + Params const* params_ptr_, + EpiTileCoordMN epi_tile_coord_mn_, // (epi_tile_coord_m, epi_tile_coord_n) + ElementType norm_constant_, + ElementType norm_constant_scaled_down_) + : tC_rSFD(cute::forward(tC_rSFD_)) + , tC_gSFD(cute::forward(tC_gSFD_)) + , tC_cSFD(tC_cSFD_) + , residue_tC_cSFD(residue_tC_cSFD_) + , 
params_ptr(params_ptr_) + , norm_constant(norm_constant_) + , norm_constant_scaled_down(norm_constant_scaled_down_) + , epi_tile_coord_mn(epi_tile_coord_mn_){} + + static_assert(is_same_v); + RTensor tC_rSFD; + GTensor tC_gSFD; + CoordGTensor tC_cSFD; + ThrResidue residue_tC_cSFD; + Params const* params_ptr; + ElementCompute norm_constant; + ElementCompute norm_constant_scaled_down; + EpiTileCoordMN epi_tile_coord_mn; + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, + int epi_v, + int epi_m, + int epi_n, + Array const& frg_input) + { + static_assert(FragmentSize % SFVecSize == 0, "Scale factor vector size should divide FragmentSize"); + constexpr int NumVecs = FragmentSize / SFVecSize; + Array frg_compute; + + auto input_frgs = reinterpret_cast const*>(frg_input.data()); + auto compute_frgs = reinterpret_cast *>(frg_compute.data()); + + Tensor tC_rSFD_frg = recast>(coalesce(filter(tC_rSFD))); // (EPI_V) + + cutlass::multiplies mul; + cutlass::maximum_absolute_value_reduction, true> amax_reduction; + + cutlass::Array vec_maxs; + cutlass::Array pvscales; + // SF generation + CUTLASS_PRAGMA_UNROLL + for (int sf_v = 0; sf_v < NumVecs; ++sf_v) { + compute_frgs[sf_v] = NumericArrayConverter{}(input_frgs[sf_v]); + /// Step1: get max across a vector + vec_maxs[sf_v] = amax_reduction(ElementCompute(0), compute_frgs[sf_v]); + } + + /// Step2: Compute Scale + pvscales = cutlass::multiplies>{}(vec_maxs, norm_constant_scaled_down); + + tC_rSFD_frg(_0{}) = cutlass::NumericArrayConverter{}(pvscales); + + Tensor tCgSFD_flt = filter_zeros(tC_gSFD(_,_,_,_0{},_0{},get<0>(epi_tile_coord_mn) + epi_m, get<1>(epi_tile_coord_mn) + epi_n)); + Tensor tCrSFD_flt = filter_zeros(tC_rSFD); + constexpr auto MCL = decltype(max_common_layout(tCgSFD_flt, tCrSFD_flt)){}; + constexpr int V = cute::min(4, size(MCL)); + using VecType = uint_bit_t>; + Tensor tCgSFD_vec = recast(coalesce(tCgSFD_flt)); + Tensor tCrSFD_vec = recast(coalesce(tCrSFD_flt)); + Tensor tCcSFD_pred = 
tC_cSFD(_,_,_, epi_m, epi_n); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tCrSFD_vec); i++){ + if (elem_less(tCcSFD_pred(i * SFVecSize * V), residue_tC_cSFD)) { + tCgSFD_vec(i) = tCrSFD_vec(i); + } + } + /// Step3: Compute quantized output values + return detail::compute_quantized_with_row_scalefactor(frg_compute, tC_rSFD_frg(_0{}), norm_constant); + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [tile_coord_m, tile_coord_n, tile_coord_k, tile_coord_l] = args.tile_coord_mnkl; + using Sm1xxBlockScaledOutputConfig= cutlass::detail::Sm1xxBlockScaledOutputConfig; + UnderlyingElementBlockScaleFactor* ptr_scale_factor = nullptr; + // If Ptr-Array/Grouped GEMM with BlockScaleFactor per batch/group + if constexpr (!cute::is_same_v) { + ptr_scale_factor = params_ptr->ptr_scale_factor[tile_coord_l]; + tile_coord_l = 0; + } + else { + ptr_scale_factor = params_ptr->ptr_scale_factor; + } + + auto epi_tile_mn = shape<1>(zipped_divide(make_layout(take<0,2>(args.tile_shape_mnk)), args.epi_tile)); + Tensor mSFD = make_tensor(make_gmem_ptr(ptr_scale_factor), Sm1xxBlockScaledOutputConfig::tile_atom_to_shape_SFD(args.problem_shape_mnkl)); + static_assert(size<1>(EpilogueTile{}) && ((size<1>(EpilogueTile{}) & (size<1>(EpilogueTile{}) - 1)) == 0), "Epilogue Tile N should be pow of 2"); + Tensor gSFD = local_tile(mSFD, args.epi_tile, make_coord(_,_,tile_coord_l)); // (EPI_M,EPI_N, #EPI_Ms, #EPI_Ns) + Tensor tCgSFD = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,#EPI_Ms, #EPI_Ns) + gSFD, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tCrSFD = make_tensor_like(take<0,3>(cute::layout(tCgSFD))); // (CPY,CPY_M,CPY_N) + + auto epi_tile_coord_mn = make_coord(tile_coord_m * size<0>(epi_tile_mn), tile_coord_n * size<1>(epi_tile_mn)); + 
+ // Fetch and compute these during initialization + Tensor mNormConst= make_tensor(make_gmem_ptr(params_ptr->norm_constant_ptr), make_layout(make_shape(M, N, L), params_ptr->norm_constant_stride)); + ElementCompute norm_constant = mNormConst(_0{},_0{},tile_coord_l); + ElementCompute fp_max = ElementCompute(cutlass::platform::numeric_limits::max()); + ElementCompute scale_down_factor = cutlass::reciprocal_approximate_ftz{}(fp_max); + ElementCompute norm_constant_scaled_down = cutlass::multiplies{}(norm_constant, scale_down_factor); +#if 0 + if(threadIdx.x == 128 && blockIdx.x == 0 && blockIdx.y == 0){ + print("epi_tile ");print(args.epi_tile); print("\n"); + print("mSFD ");print(mSFD); print("\n"); + print("gSFD ");print(gSFD); print("\n"); + print("tCgSFD ");print(tCgSFD); print("\n"); + print("tCrSFD ");print(tCrSFD); print("\n"); + print("filter(tCrSFD) ");print(filter(tCrSFD)); print("\n"); + print("filter(tCgSFD) ");print(filter(tCgSFD)); print("\n"); + } +#endif + + return ConsumerStoreCallbacks( + cute::move(tCrSFD), + cute::move(tCgSFD), + args.tCcD, + args.residue_tCcD, + params_ptr, + epi_tile_coord_mn, + norm_constant, + norm_constant_scaled_down); + + } +}; + +template < + int SFVecSize, + class EpilogueTile, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +struct Sm100BlockScaleFactorColStore { + + static_assert(size<0>(EpilogueTile{}) % SFVecSize == 0, "EpilogueTileN should be divisible by SFVecSize"); + static_assert(size<0>(EpilogueTile{}) / SFVecSize == 1 or + size<0>(EpilogueTile{}) / SFVecSize == 2 or + size<0>(EpilogueTile{}) / SFVecSize == 4 or + size<0>(EpilogueTile{}) / SFVecSize == 8, + "Possible store in interleaved 4B aligned format"); + using NormalConstStrideMNL = Stride<_0,_0,int64_t>; + static constexpr int NumSyncWarps = SFVecSize == 64 ? 
4 : 0; + static constexpr int NumSyncThreads = NumSyncWarps * NumThreadsPerWarp; + struct SharedStorage { + array_aligned smem_aux; + }; + + struct Arguments { + ElementBlockScaleFactor* ptr_scale_factor = nullptr; + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. + ElementCompute const* norm_constant_ptr = nullptr; + NormalConstStrideMNL norm_constant_stride = {}; + }; + + using Params = Arguments; + + // BlockScaleFactor generation is per batch or group + // For Ptr-Array GEMM and Grouped GEMM, ElementBlockScaleFactor is ElementType* + using UnderlyingElementBlockScaleFactor = cute::remove_pointer_t; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M,N,K,L] = problem_shape_MNKL; + bool implementable = (M % SFVecSize == 0); + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: [EVT Sm100BlockScaleFactorColStore] M-dim should be divisible by SFVecSize.\n"); + } + return implementable; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm100BlockScaleFactorColStore() { } + + CUTLASS_HOST_DEVICE + Sm100BlockScaleFactorColStore(Params const& params, SharedStorage const& shared_storage) + : params_ptr(¶ms) + , smem_aux(const_cast(shared_storage.smem_aux.data())) { } + + Params const* params_ptr = nullptr; + ElementCompute *smem_aux = nullptr; + + CUTLASS_DEVICE bool + is_producer_load_needed() 
const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template < + class RTensor, + class GTensor, + class STensor, + class CoordGTensor, + class ThrResidue, + class EpiTileCoordMN, + class ElementType + > + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + // Normally, we should use tile_shape_mnk to tile the gtensor. + // However, the SF gtensor could not be divisible by non-pow2 cta tile, so we use epi tile (pow2) to do tiling. + CUTLASS_DEVICE + ConsumerStoreCallbacks( + RTensor&& tC_rSFD_, // (CPY,CPY_M,CPY_N) + GTensor&& tC_gSFD_, // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,#EPI_Ms, #EPI_Ns) + STensor&& sAmaxs_, // (NumSyncWarps) + CoordGTensor tC_cSFD_, // (m,n) + ThrResidue residue_tC_cSFD_, // (m,n) + Params const* params_ptr_, + EpiTileCoordMN epi_tile_coord_mn_, // (epi_tile_coord_m, epi_tile_coord_n) + ElementType norm_constant_, + ElementType norm_constant_scaled_down_) + : tC_rSFD(cute::forward(tC_rSFD_)) + , tC_gSFD(cute::forward(tC_gSFD_)) + , sAmaxs(cute::forward(sAmaxs_)) + , tC_cSFD(tC_cSFD_) + , residue_tC_cSFD(residue_tC_cSFD_) + , params_ptr(params_ptr_) + , norm_constant(norm_constant_) + , norm_constant_scaled_down(norm_constant_scaled_down_) + , epi_tile_coord_mn(epi_tile_coord_mn_) {} + + static_assert(is_same_v); + RTensor tC_rSFD; + GTensor tC_gSFD; + STensor sAmaxs; + CoordGTensor tC_cSFD; + ThrResidue residue_tC_cSFD; + Params const* params_ptr; + ElementCompute norm_constant; + ElementCompute norm_constant_scaled_down; + EpiTileCoordMN epi_tile_coord_mn; + + CUTLASS_DEVICE + ElementCompute find_amax(ElementCompute max) { + // Overall idea: after TMEM_LOAD.32DP32bit pattern, each thread in the warp can load adjacent elements of a column into its private RF. 
+ // Here we are using shuffle instructons to the amax value of the adjacent column elements. + // For VS16, t0~t15 would generate an amax, and t16~t31 would generate another one. + // For VS32, t0~t31 should generate an amax. + // For VS64, t0~t63 should generate an amax. We would first do the reduciton within a warp, + // and then use smem to do inter-warp reduction. + if constexpr (SFVecSize == 32) { + return cutlass::redux_abs_max_nan_propagation_sync_warp{}(max); + } + else if constexpr (SFVecSize == 16) { + return cutlass::redux_abs_max_nan_propagation_sync_warp_t0t15_t16t31{}(max); + } + else if constexpr (SFVecSize == 64) { + // Get abs_max per warp + auto abs_max = cutlass::redux_abs_max_nan_propagation_sync_warp{}(max); + + // Switch the amax of adjacent warps + const bool leading_thread = (threadIdx.x % NumThreadsPerWarp) == 0; + const int warp_idx = threadIdx.x / NumThreadsPerWarp % 4; + auto synchronize = [] () CUTLASS_LAMBDA_FUNC_INLINE { cutlass::arch::NamedBarrier::sync(NumSyncThreads, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; + // Inter-warp reduction for VS=64 + // Only 4 * FP32 = 16 bytes smem is needed as we have 4 warps. + if (leading_thread) { + sAmaxs(warp_idx) = abs_max; + } + synchronize(); + // Switch data between two adjacent warps to do reduction + float tmp = sAmaxs(warp_idx^1); + synchronize(); + abs_max = cutlass::maximum_with_nan_propagation{}(abs_max,tmp); + return abs_max; + } + else { + static_assert(cutlass::detail::dependent_false, "Unsupported VecSize"); + } + } + + template + CUTLASS_DEVICE auto + compute_quantized_value(Array compute, Array sf) { + cutlass::multiplies> mul_array; + auto qpvscale_rcp = [&]() CUTLASS_LAMBDA_FUNC_INLINE { + if constexpr (cute::is_same_v) { + // UE8M0: Use integer subtraction to do the fast rcp in ue8m0 and then convert to float. 
+ auto e8m0_qpvscale_rcps = cutlass::reciprocal_approximate>{}(sf); + return cutlass::NumericArrayConverter{}(e8m0_qpvscale_rcps); + } + else { + // UE4M3: Do the rcp in fp32 data type. + auto qpvscale_up = cutlass::NumericArrayConverter{}(sf); + return cutlass::reciprocal_approximate_ftz{}(qpvscale_up); + } + }(); + // norm_constant and qpvscale_rcps[sf_v] are all positive numbers. + auto acc_scale = mul_array(norm_constant, qpvscale_rcp); + // Map INF to fp32::max + acc_scale = minimum_with_nan_propagation{}(acc_scale, cutlass::platform::numeric_limits::max()); + return mul_array(compute, acc_scale); + } + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, + int epi_v, + int epi_m, + int epi_n, + Array const& frg_input) + { + constexpr int NumVecs = 1; // each thread only compute 1 col scalefactors + Array frg_compute; + Array frg_output; + Array frg_scale_float; + Array frg_amax; + Array frg_scale; + + Tensor tC_rSFD_frg = recast>(coalesce(filter(tC_rSFD))); // (EPI_V) + + cutlass::multiplies mul; + cutlass::multiplies> mul_array; + /// convert acc to Element Compute + auto compute_frgs = NumericArrayConverter{}(frg_input); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + /// Step1: get max across a vector + frg_amax[i] = find_amax(compute_frgs[i]); + } + + frg_scale_float = mul_array(frg_amax, norm_constant_scaled_down); + frg_scale = cutlass::NumericArrayConverter{}(frg_scale_float); + auto tC_cSFD_pred = tC_cSFD(_,_,_,epi_m,epi_n); + auto tC_gSFD_store = tC_gSFD(_,_,_,_,_,get<0>(epi_tile_coord_mn) + epi_m, get<1>(epi_tile_coord_mn) + epi_n); + for (int i=0; i < cute::ceil_div(FragmentSize, SFVecSize); i++) { + int idx = i * SFVecSize + threadIdx.x % SFVecSize; + if (idx < FragmentSize && elem_less(tC_cSFD_pred(idx), residue_tC_cSFD)) { + UnderlyingElementBlockScaleFactor tmp = frg_scale[idx]; + // Store the (EpilogueTile / SFVecSize) elements. 
+ tC_gSFD_store(idx) = tmp; + } + } + + /// Step3: Compute quantized output values + if constexpr (cute::sizeof_bits_v == 4) { + return compute_quantized_value(compute_frgs, frg_scale); // ElementCompute + } + else { + // 6bits or 8bits output. + compute_frgs = compute_quantized_value(compute_frgs, frg_scale); + frg_output = cutlass::NumericArrayConverter{}(compute_frgs); + return frg_output; // ElementOutput + } + + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [tile_coord_m, tile_coord_n, tile_coord_k, tile_coord_l] = args.tile_coord_mnkl; + using Sm1xxBlockScaledOutputConfig = cutlass::detail::Sm1xxBlockScaledOutputConfig; + UnderlyingElementBlockScaleFactor* ptr_scale_factor = nullptr; + // If Ptr-Array/Grouped GEMM with BlockScaleFactor per batch/group + if constexpr (!cute::is_same_v) { + ptr_scale_factor = params_ptr->ptr_scale_factor[tile_coord_l]; + tile_coord_l = 0; + } + else { + ptr_scale_factor = params_ptr->ptr_scale_factor; + } + + auto epi_tile_mn = shape<1>(zipped_divide(make_layout(take<0,2>(args.tile_shape_mnk)), args.epi_tile)); + Tensor mSFD = make_tensor(make_gmem_ptr(ptr_scale_factor), Sm1xxBlockScaledOutputConfig::tile_atom_to_shape_SFD(args.problem_shape_mnkl)); + //Tensor gSFD = local_tile(mSFD, take<0,2>(args.tile_shape_mnk), make_coord(m,n,l)); + // Normally, we should use tile_shape_mnk to tile the mSFD tensor. However, we could not do it for non-pow2 cta tile with vectorsize = 32. + // For scale factor, 128x4 elements are stored in a basic block, and the layout of mSFD is ((_32,_4,int),(_32,_4,int),int):((_16,_4,int),(_0,_1, int),int) + // If we tiled it using tile_shape_mnk(128, 192), the N mode would encounter shape_div failure because (32, 4) could not be divisible by 192. 
+ // Therefore, switching to using pow2 epilogue tile. + static_assert(size<1>(EpilogueTile{}) && ((size<1>(EpilogueTile{}) & (size<1>(EpilogueTile{}) - 1)) == 0), "Epilogue Tile N should be pow of 2"); + Tensor gSFD = local_tile(mSFD, args.epi_tile, make_coord(_,_,tile_coord_l)); // (EPI_M,EPI_N, #EPI_Ms, #EPI_Ns) + Tensor tCgSFD = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,#EPI_Ms, #EPI_Ns) + gSFD, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tCrSFD = make_tensor_like(take<0,3>(cute::layout(tCgSFD))); // (CPY,CPY_M,CPY_N) + + auto epi_tile_coord_mn = make_coord(tile_coord_m * size<0>(epi_tile_mn), tile_coord_n * size<1>(epi_tile_mn)); + + // Fetch and compute these during initialization + Tensor mNormConst= make_tensor(make_gmem_ptr(params_ptr->norm_constant_ptr), make_layout(make_shape(M, N, L), params_ptr->norm_constant_stride)); + ElementCompute norm_constant = mNormConst(_0{},_0{},tile_coord_l); + ElementCompute fp_max = ElementCompute(cutlass::platform::numeric_limits::max()); + ElementCompute scale_down_factor = cutlass::reciprocal_approximate_ftz{}(fp_max); + ElementCompute norm_constant_scaled_down = cutlass::multiplies{}(norm_constant, scale_down_factor); + + Tensor sAmaxs = make_tensor(make_smem_ptr(smem_aux), make_layout(_4{})); +#if 0 + if(threadIdx.x == 128 && blockIdx.x == 0 && blockIdx.y == 0){ + print("mSFD ");print(mSFD); print("\n"); + print("gSFD ");print(gSFD); print("\n"); + print("tCgSFD ");print(tCgSFD); print("\n"); + print("tCrSFD ");print(tCrSFD); print("\n"); + print("args.tCcD ");print(args.tCcD); print("\n"); + print("args.residue_tCcD ");print(args.residue_tCcD); print("\n"); + print("filter(tCrSFD) ");print(filter(tCrSFD)); print("\n"); + print("filter(tCgSFD) ");print(filter(tCgSFD)); print("\n"); + } +#endif + + return ConsumerStoreCallbacks( + cute::move(tCrSFD), + cute::move(tCgSFD), + cute::move(sAmaxs), + args.tCcD, + args.residue_tCcD, + params_ptr, + epi_tile_coord_mn, + norm_constant, + 
norm_constant_scaled_down); + } +}; + +} // namespace cutlass::epilogue::fusion + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm120_callbacks_tma_warpspecialized.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm120_callbacks_tma_warpspecialized.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b769b1f0fbe2aa78f0ee97da442fb61c1aa49cc8 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm120_callbacks_tma_warpspecialized.hpp @@ -0,0 +1,1593 @@ +/*************************************************************************************************** + * Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + + +/*! \file + \brief Fusion callbacks specializations for the SM120 TMA warp-specialized (ws) epilogue +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" + +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/fusion/callbacks.hpp" +#include "cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp" +#include "cutlass/epilogue/fusion/sm100_callbacks_tma_warpspecialized.hpp" +#include "cutlass/epilogue/fusion/sm120_visitor_store_tma_warpspecialized.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::fusion { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Sm120 Tma warp specialized callbacks just alias to their sm90 counterpart +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class Operation, + class CtaTile_MNK, + class EpilogueTile_MN, + class... Args +> +struct FusionCallbacks< + epilogue::Sm120TmaWarpSpecialized, + Operation, + CtaTile_MNK, + EpilogueTile_MN, + Args... +> : FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + Operation, + CtaTile_MNK, + EpilogueTile_MN, + Args... 
+ > { + using FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + Operation, + CtaTile_MNK, + EpilogueTile_MN, + Args...>::FusionCallbacks; +}; + +// D = alpha * acc + beta * C +// With BlockScaleFactor Generation. +// 1. Find max of 32 F32 elements +// 2. Convert the max to UE8 (or UE4M3) and store the result. +// 3. Convert the UE8 (or UE4M3) back to F32 scale. +// 4. Reciprocal of F32 scale with MUFU. +// 5. Multiply each F32 element with the above reciprocal, then convert to ElementD +template< + int SFVecsize, + class EpilogueTile, + class CtaTileShapeMNK, + int FragmentSize, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm120LinearCombRowBlockScaleFactor = + Sm90EVT, // gen scalefactor + Sm90LinearCombination // beta * C + (alpha * acc) + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementSource, + class ElementScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm120TmaWarpSpecialized, + fusion::LinCombBlockScaleFactor, + CtaTileShapeMNK, + EpilogueTile +> : Sm120LinearCombRowBlockScaleFactor::type,ElementCompute, ElementBlockScaleFactor, ElementSource, ElementScalar, RoundStyle> { + + using Impl = Sm120LinearCombRowBlockScaleFactor::type,ElementCompute, ElementBlockScaleFactor, ElementSource, ElementScalar, RoundStyle>; + + using Sm100Fusion = FusionCallbacks< + epilogue::Sm100TmaWarpSpecialized, + fusion::LinCombBlockScaleFactor, + CtaTileShapeMNK, + EpilogueTile + >; + using Operation = typename Sm100Fusion::Operation; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + 
ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementBlockScaleFactor * block_scale_factor_ptr = nullptr; + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. + using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + operator typename Impl::Arguments() const { + return + { + { + // ternary op : beta * C + (alpha * acc) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // binary op : alpha * acc + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // binary args : multiplies + }, // end binary op + {} // ternary args : multiply_add + }, + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +// D = alpha * acc + beta * C + per-row bias +// with row blockScaled generation +template< + int SFVecsize, + class EpilogueTile, + class CtaTileShapeMNK, + int FragmentSize, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm120LinCombPerRowBiasRowBlockScaleFactor = + Sm90EVT< + Sm120BlockScaleFactorRowStore< + SFVecsize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ElementOutput, + ElementCompute, ElementBlockScaleFactor, RoundStyle + >, // gen scalefactor + Sm90LinCombPerRowBias< + CtaTileShapeMNK, ElementCompute, ElementCompute, + ElementBias, ElementSource, ElementScalar, + AlignmentBias, 
RoundStyle + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm120TmaWarpSpecialized, + fusion::LinCombPerRowBiasBlockScaleFactor< + SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementBias, ElementSource, ElementScalar,AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm120LinCombPerRowBiasRowBlockScaleFactor< + SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias, + ElementSource, ElementScalar, AlignmentBias, RoundStyle + > +{ + + using Impl = + Sm120LinCombPerRowBiasRowBlockScaleFactor< + SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias, + ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + + using Operation = + fusion::LinCombPerRowBiasBlockScaleFactor< + SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementBias, ElementSource, ElementScalar,AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementBlockScaleFactor * block_scale_factor_ptr = nullptr; + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. 
+ using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_1,_0,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + operator typename Impl::Arguments() const { + return + { + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +// D = activation(alpha * acc + beta * C + per-row bias) +// with row blockScaled generation +template< + int SFVecsize, + class EpilogueTile, + class CtaTileShapeMNK, + int FragmentSize, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm120LinCombPerRowBiasEltActRowBlockScaleFactor = + Sm90EVT< + Sm120BlockScaleFactorRowStore< + SFVecsize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ElementOutput, + ElementCompute, ElementBlockScaleFactor, RoundStyle + >, // gen scalefactor + Sm90LinCombPerRowBiasEltAct< + CtaTileShapeMNK, ActivationFn, + ElementCompute, ElementCompute, ElementBias, + ElementSource, 
ElementScalar, AlignmentBias, RoundStyle + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm120TmaWarpSpecialized, + fusion::LinCombPerRowBiasEltActBlockScaleFactor< + ActivationFn, SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementBias, ElementSource, ElementScalar,AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm120LinCombPerRowBiasEltActRowBlockScaleFactor< + SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ActivationFn, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias,ElementSource, ElementScalar, + AlignmentBias, RoundStyle + > { + + using Impl = + Sm120LinCombPerRowBiasEltActRowBlockScaleFactor< + SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ActivationFn, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias,ElementSource, ElementScalar, + AlignmentBias, RoundStyle + >; + + using Operation = + fusion::LinCombPerRowBiasEltActBlockScaleFactor< + ActivationFn, SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementBias, ElementSource, ElementScalar,AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementBlockScaleFactor * block_scale_factor_ptr = nullptr; + // A matrix wide constant value to scale the output matrix + // Avoids 
generating small FP4 values. + using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_1,_0,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { + { // unary op : activation(beta * C + (alpha * acc + bias)) + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args : activation + }, // end unary op + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +// D = alpha * acc + beta * C + per_col bias +// with row blockScaled generation +template< + int StagesC, + int SFVecsize, + class EpilogueTile, + class CtaTileShapeMNK, + int FragmentSize, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm120LinCombPerColBiasRowBlockScaleFactor = + Sm90EVT< + Sm120BlockScaleFactorRowStore< + SFVecsize, EpilogueTile, CtaTileShapeMNK, 
FragmentSize, ElementOutput, + ElementCompute, ElementBlockScaleFactor, RoundStyle + >, // gen scalefactor + Sm90LinCombPerColBias< + StagesC, CtaTileShapeMNK, EpilogueTile, ElementCompute, ElementCompute, + ElementBias, ElementSource, ElementScalar, + AlignmentBias, RoundStyle + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm120TmaWarpSpecialized, + fusion::LinCombPerColBiasBlockScaleFactor< + SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementBias, ElementSource, + ElementScalar, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm120LinCombPerColBiasRowBlockScaleFactor< + StagesC, SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias, + ElementSource, ElementScalar, AlignmentBias, RoundStyle + > +{ + + using Impl = + Sm120LinCombPerColBiasRowBlockScaleFactor< + StagesC, SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias, + ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + + using Operation = + fusion::LinCombPerColBiasBlockScaleFactor< + SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementBias, ElementSource, + ElementScalar, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* 
beta_ptr = nullptr; + ElementBlockScaleFactor * block_scale_factor_ptr = nullptr; + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. + using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + + using StrideBias = Stride<_0,_1,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + operator typename Impl::Arguments() const { + return + { + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +// D = activation(alpha * acc + beta * C + per_col bias) +// with row blockScaled generation +template< + int StagesC, + int SFVecsize, + class EpilogueTile, + class CtaTileShapeMNK, + int FragmentSize, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm120LinCombPerColBiasEltActRowBlockScaleFactor = + Sm90EVT< + Sm120BlockScaleFactorRowStore< + SFVecsize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ElementOutput, + 
ElementCompute, ElementBlockScaleFactor, RoundStyle + >, // gen scalefactor + Sm90LinCombPerColBiasEltAct< + StagesC, CtaTileShapeMNK, EpilogueTile, ActivationFn, + ElementCompute, ElementCompute, ElementBias, + ElementSource, ElementScalar, AlignmentBias, RoundStyle + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm120TmaWarpSpecialized, + fusion::LinCombPerColBiasEltActBlockScaleFactor< + ActivationFn, SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementBias, ElementSource, + ElementScalar, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm120LinCombPerColBiasEltActRowBlockScaleFactor< + StagesC, SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ActivationFn, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias,ElementSource, ElementScalar, + AlignmentBias, RoundStyle + > { + + using Impl = + Sm120LinCombPerColBiasEltActRowBlockScaleFactor< + StagesC, SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ActivationFn, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias,ElementSource, ElementScalar, + AlignmentBias, RoundStyle + >; + + using Operation = + fusion::LinCombPerColBiasEltActBlockScaleFactor< + ActivationFn, SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementBias, ElementSource, + ElementScalar, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + 
ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementBlockScaleFactor * block_scale_factor_ptr = nullptr; + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. + using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_0,_1,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { + { // unary op : activation(beta * C + (alpha * acc + bias)) + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args : activation + }, // end unary op + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = alpha * acc + beta * C +// with per column blockScaled generation +// 1. Find max of 32 F32 elements +// 2. Convert the max to UE8 (or UE4M3) and store the result. +// 3. Convert the UE8 (or UE4M3) back to F32 scale. +// 4. Reciprocal of F32 scale with MUFU. +// 5. 
Multiply each F32 element with the above reciprocal, then convert to ElementD +template< + int SFVecsize, + class EpilogueTile, + class CtaTileShapeMNK, + int FragmentSize, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm120LinearCombColBlockScaleFactor = Sm90EVT< + Sm120BlockScaleFactorColStore< + SFVecsize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ElementOutput, + ElementCompute, ElementBlockScaleFactor, RoundStyle>, + Sm90LinearCombination< + ElementCompute, ElementCompute, ElementSource, ElementScalar, RoundStyle> + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementSource, + class ElementScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm120TmaWarpSpecialized< + StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>, + fusion::LinCombBlockScaleFactor< + SFVecSize, ElementOutput, ElementCompute,ElementBlockScaleFactor, + cutlass::layout::ColumnMajor, ElementSource, ElementScalar, RoundStyle>, + CtaTileShapeMNK, + EpilogueTile +> : Sm120LinearCombColBlockScaleFactor< + SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementSource, ElementScalar, RoundStyle + > { + + using Impl = Sm120LinearCombColBlockScaleFactor::type,ElementCompute, ElementBlockScaleFactor, ElementSource, ElementScalar, RoundStyle>; + + using Sm100Fusion = FusionCallbacks< + epilogue::Sm100TmaWarpSpecialized, + fusion::LinCombBlockScaleFactor, + CtaTileShapeMNK, + EpilogueTile + >; + using Operation = typename Sm100Fusion::Operation; + + struct 
Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementBlockScaleFactor * block_scale_factor_ptr = nullptr; + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. + using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + operator typename Impl::Arguments() const { + return + { + { + // ternary op : beta * C + (alpha * acc) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // binary op : alpha * acc + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // binary args : multiplies + }, // end binary op + {} // ternary args : multiply_add + }, + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +// D = alpha * acc + beta * C + per-Col bias +// with per column blockScaled generation +template< + int StagesC, + int SFVecsize, + class EpilogueTile, + class CtaTileShapeMNK, + int FragmentSize, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm120LinCombPerColBiasColBlockScaleFactor = + Sm90EVT< + Sm120BlockScaleFactorColStore< + SFVecsize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ElementOutput, + ElementCompute, ElementBlockScaleFactor, RoundStyle + >, + Sm90LinCombPerColBias< + StagesC, 
CtaTileShapeMNK, EpilogueTile, ElementCompute, ElementCompute, + ElementBias, ElementSource, ElementScalar, + AlignmentBias, RoundStyle + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm120TmaWarpSpecialized, + fusion::LinCombPerColBiasBlockScaleFactor< + SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::ColumnMajor, + ElementBias, ElementSource, ElementScalar,AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm120LinCombPerColBiasColBlockScaleFactor< + StagesC, SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias, + ElementSource, ElementScalar, AlignmentBias, RoundStyle + > +{ + + using Impl = + Sm120LinCombPerColBiasColBlockScaleFactor< + StagesC, SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias, + ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + + using Operation = + fusion::LinCombPerColBiasBlockScaleFactor< + SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::ColumnMajor, + ElementBias, ElementSource, ElementScalar,AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementBlockScaleFactor * block_scale_factor_ptr = nullptr; + // A matrix wide constant value to scale the output matrix + 
// Avoids generating small FP4 values. + using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_0,_1,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + operator typename Impl::Arguments() const { + return + { + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +// D = activation(alpha * acc + beta * C + per_col bias) +// with per column blockScaled generation +template< + int StagesC, + int SFVecsize, + class EpilogueTile, + class CtaTileShapeMNK, + int FragmentSize, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm120LinCombPerColBiasEltActColBlockScaleFactor = + Sm90EVT< + Sm120BlockScaleFactorColStore< + SFVecsize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ElementOutput, + ElementCompute, ElementBlockScaleFactor, RoundStyle + >, + Sm90LinCombPerColBiasEltAct< + StagesC, CtaTileShapeMNK, EpilogueTile, ActivationFn, + 
ElementCompute, ElementCompute, ElementBias, + ElementSource, ElementScalar, AlignmentBias, RoundStyle + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm120TmaWarpSpecialized, + fusion::LinCombPerColBiasEltActBlockScaleFactor< + ActivationFn, SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::ColumnMajor, + ElementBias, ElementSource, + ElementScalar, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm120LinCombPerColBiasEltActColBlockScaleFactor< + StagesC, SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ActivationFn, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias,ElementSource, ElementScalar, + AlignmentBias, RoundStyle + > { + + using Impl = + Sm120LinCombPerColBiasEltActColBlockScaleFactor< + StagesC, SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ActivationFn, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias,ElementSource, ElementScalar, + AlignmentBias, RoundStyle + >; + + using Operation = + fusion::LinCombPerColBiasEltActBlockScaleFactor< + ActivationFn, SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::ColumnMajor, + ElementBias, ElementSource, + ElementScalar, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementBlockScaleFactor * 
block_scale_factor_ptr = nullptr; + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. + using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_0,_1,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { + { // unary op : activation(beta * C + (alpha * acc + bias)) + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args : activation + }, // end unary op + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +// D = activation(alpha * acc + beta * C + per-row bias) +// with per column blockScaled generation +template< + int StagesC, + int SFVecsize, + class EpilogueTile, + class CtaTileShapeMNK, + int FragmentSize, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = 
FloatRoundStyle::round_to_nearest +> +using Sm120LinCombPerRowBiasEltActColBlockScaleFactor = + Sm90EVT< + Sm120BlockScaleFactorColStore< + SFVecsize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ElementOutput, + ElementCompute, ElementBlockScaleFactor, RoundStyle + >, + Sm90LinCombPerRowBiasEltAct< + CtaTileShapeMNK, ActivationFn, + ElementCompute, ElementCompute, ElementBias, + ElementSource, ElementScalar, AlignmentBias, RoundStyle + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm120TmaWarpSpecialized, + fusion::LinCombPerRowBiasEltActBlockScaleFactor< + ActivationFn, SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::ColumnMajor, + ElementBias, ElementSource, ElementScalar,AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm120LinCombPerRowBiasEltActColBlockScaleFactor< + StagesC, SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ActivationFn, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias,ElementSource, ElementScalar, + AlignmentBias, RoundStyle + > { + + + using Impl = + Sm120LinCombPerRowBiasEltActColBlockScaleFactor< + StagesC, SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ActivationFn, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias,ElementSource, ElementScalar, + AlignmentBias, RoundStyle + >; + + using Operation = + fusion::LinCombPerRowBiasEltActBlockScaleFactor< + ActivationFn, SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, 
cutlass::layout::ColumnMajor, + ElementBias, ElementSource, ElementScalar,AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementBlockScaleFactor * block_scale_factor_ptr = nullptr; + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. + using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_1,_0,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { + { // unary op : activation(beta * C + (alpha * acc + bias)) + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args : activation + }, // end unary op + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + + +// D = alpha * acc + beta * C + per-row bias +// with per column blockScaled generation +template< + int SFVecsize, + class EpilogueTile, + class CtaTileShapeMNK, + int FragmentSize, + class ElementOutput, + 
class ElementCompute, + class ElementBlockScaleFactor, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm120LinCombPerRowBiasColBlockScaleFactor = + Sm90EVT< + Sm120BlockScaleFactorColStore< + SFVecsize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ElementOutput, + ElementCompute, ElementBlockScaleFactor, RoundStyle + >, // gen scalefactor + Sm90LinCombPerRowBias< + CtaTileShapeMNK, ElementCompute, ElementCompute, + ElementBias, ElementSource, ElementScalar, + AlignmentBias, RoundStyle + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm120TmaWarpSpecialized, + fusion::LinCombPerRowBiasBlockScaleFactor< + SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::ColumnMajor, + ElementBias, ElementSource, ElementScalar,AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm120LinCombPerRowBiasColBlockScaleFactor< + SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias, + ElementSource, ElementScalar, AlignmentBias, RoundStyle + > +{ + + using Impl = + Sm120LinCombPerRowBiasColBlockScaleFactor< + SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementBias, + ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + + using Operation = + 
fusion::LinCombPerRowBiasBlockScaleFactor< + SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::ColumnMajor, + ElementBias, ElementSource, ElementScalar,AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementBlockScaleFactor * block_scale_factor_ptr = nullptr; + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. + using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_1,_0,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + operator typename Impl::Arguments() const { + return + { + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +// Sm120 Ptr array tma warp specialized callbacks just alias to their sm90 counterpart +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + int NumEpilogueWarpGroups, + class Operation, + class CtaTile_MNK, + class EpilogueTile_MN, + class... 
Args +> +struct FusionCallbacks< + epilogue::Sm120PtrArrayTmaWarpSpecialized, + Operation, + CtaTile_MNK, + EpilogueTile_MN, + Args... +> : FusionCallbacks< + epilogue::Sm90PtrArrayTmaWarpSpecialized, + Operation, + CtaTile_MNK, + EpilogueTile_MN, + Args... + > { + using FusionCallbacks< + epilogue::Sm90PtrArrayTmaWarpSpecialized, + Operation, + CtaTile_MNK, + EpilogueTile_MN, + Args...>::FusionCallbacks; +}; + +// For Ptr-Array and Grouped GEMM +// D = alpha * acc + beta * C, where alpha and beta can be vectors for each batch/group +// With Row BlockScaleFactor Generation, separate tensors per batch/group. +template< + int SFVecsize, + class EpilogueTile, + class CtaTileShapeMNK, + int FragmentSize, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm120LinearCombRowBlockScaleFactorPtrArray = + Sm90EVT< + Sm120BlockScaleFactorRowStore< + SFVecsize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ElementOutput, + ElementCompute, ElementBlockScaleFactor *, RoundStyle + >, // gen scalefactor + Sm90LinearCombinationPtrArray< ElementCompute, ElementCompute, + ElementSource, ElementScalar, RoundStyle + > // beta * C + (alpha * acc) + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + int NumEpilogueWarpGroups, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementSource, + class ElementScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm120PtrArrayTmaWarpSpecialized, + fusion::LinCombBlockScaleFactor< + SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementSource, ElementScalar, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : 
Sm120LinearCombRowBlockScaleFactorPtrArray< + SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementSource, ElementScalar, RoundStyle + > { + + using Impl = + Sm120LinearCombRowBlockScaleFactorPtrArray< + SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementSource, ElementScalar, RoundStyle + >; + + using Operation = + fusion::LinCombBlockScaleFactor< + SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementSource, ElementScalar, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementScalar const* const* alpha_ptr_array = nullptr; + ElementScalar const* const* beta_ptr_array = nullptr; + ElementBlockScaleFactor ** block_scale_factor_ptr = nullptr; + + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. 
+ using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + + operator typename Impl::Arguments() const { + return + { + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {beta_ptr_array}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {alpha_ptr_array}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + + +// For Ptr-Array and Grouped GEMM +// D = activation(alpha * acc + beta * C), where alpha and beta can be vectors for each batch/group +// With Row BlockScaleFactor Generation, separate tensors per batch/group. 
+template< + int SFVecsize, + class EpilogueTile, + class CtaTileShapeMNK, + int FragmentSize, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm120LinCombEltActRowBlockScaleFactorPtrArray = + Sm90EVT< + Sm120BlockScaleFactorRowStore< + SFVecsize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ElementOutput, + ElementCompute, ElementBlockScaleFactor *, RoundStyle + >, // gen scalefactor + Sm90LinCombEltActPtrArray // activation(beta * C + (alpha * acc)) + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + int NumEpilogueWarpGroups, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + int SFVecSize, + class ElementSource, + class ElementScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm120PtrArrayTmaWarpSpecialized, + fusion::LinCombEltActBlockScaleFactor< + ActivationFn, SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementSource, ElementScalar, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm120LinCombEltActRowBlockScaleFactorPtrArray< + SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ActivationFn, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementSource, ElementScalar, RoundStyle + > { + + using Impl = + Sm120LinCombEltActRowBlockScaleFactorPtrArray< + SFVecSize, EpilogueTile, CtaTileShapeMNK, FragmentSize, ActivationFn, + typename cutlass::detail::get_unpacked_element_type::type, + ElementCompute, ElementBlockScaleFactor, ElementSource, ElementScalar, RoundStyle + >; + + using Operation = + fusion::LinCombEltActBlockScaleFactor< + 
ActivationFn, SFVecSize, ElementOutput, ElementCompute, + ElementBlockScaleFactor, cutlass::layout::RowMajor, + ElementSource, ElementScalar, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementScalar const* const* alpha_ptr_array = nullptr; + ElementScalar const* const* beta_ptr_array = nullptr; + ElementBlockScaleFactor ** block_scale_factor_ptr = nullptr; + + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. + using StrideNormConst = Stride<_0,_0,int64_t>; + ElementCompute const* norm_constant_ptr = nullptr; + StrideNormConst dNormConst = {_0{}, _0{}, 0}; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { + { // unary op : activation(beta * C + (alpha * acc + bias)) + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {beta_ptr_array}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {alpha_ptr_array}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args : activation + }, // end unary op + {block_scale_factor_ptr, norm_constant_ptr, dNormConst} // BlockScaleFactor args + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; +} // namespace cutlass::epilogue::fusion + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git 
a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm120_visitor_store_tma_warpspecialized.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm120_visitor_store_tma_warpspecialized.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e72e971bd8d99f87a2528af3c1dbd27366298ef5 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm120_visitor_store_tma_warpspecialized.hpp @@ -0,0 +1,899 @@ +/*************************************************************************************************** + * Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + + +/*! \file + \brief Visitor tree store operations for the SM120 TMA warp-specialized (ws) epilogue +*/ + + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/detail/sm100_blockscaled_layout.hpp" +#include "cute/tensor.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::fusion { + +using namespace cute; +using namespace detail; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// BlockScaleFactor Generation Operations +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + int SFVecSize, + class EpilogueTile, + class CtaTileShapeMNK, + int FragmentSize, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +struct Sm120BlockScaleFactorRowStore { + + static_assert(size<1>(EpilogueTile{}) % SFVecSize == 0, "EpilogueTileN should be divisible by SFVecSize"); + static_assert(size<1>(EpilogueTile{}) / SFVecSize == 1 or + size<1>(EpilogueTile{}) / SFVecSize 
== 2 or + size<1>(EpilogueTile{}) / SFVecSize == 4 or + size<1>(EpilogueTile{}) / SFVecSize == 8, + "Possible store in interleaved 4B aligned format"); + + static constexpr int NumWarpgroups = 2; + static constexpr int NumSyncWarps = NumWarpsPerWarpGroup * NumWarpgroups; + static constexpr int NumQuadsPerWarp = 8; + static constexpr int NumSyncQuads = NumSyncWarps * NumQuadsPerWarp; + struct SharedStorage { + array_aligned smem_aux; + }; + using NormalConstStrideMNL = Stride<_0,_0,int64_t>; + struct Arguments { + ElementBlockScaleFactor* ptr_scale_factor = {}; + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. + ElementCompute const* norm_constant_ptr = {}; + NormalConstStrideMNL norm_constant_stride = {}; + }; + + using Params = Arguments; + + using UnderlyingElementBlockScaleFactor = cute::remove_pointer_t; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M,N,K,L] = problem_shape_MNKL; + bool implementable = (N % SFVecSize == 0); + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: [EVT Sm120BlockScaleFactorRowStore] N-dim should be divisible by SFVecSize.\n"); + } + return implementable; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm120BlockScaleFactorRowStore() { } + + CUTLASS_HOST_DEVICE + Sm120BlockScaleFactorRowStore(Params const& params, SharedStorage const& shared_storage) + 
: params_ptr(¶ms) + , smem_aux(const_cast(shared_storage.smem_aux.data())) { } + + Params const* params_ptr = nullptr; + ElementCompute *smem_aux = nullptr; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template < + class RTensor, + class GTensor, + class STensor, + class CoordGTensor, + class ThrResidue, + class TileCoordMN, + class ElementType, + class TiledCopy_ + > + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + RTensor&& tC_rSFD_, + GTensor&& tC_gSFD_, + STensor&& sAmaxs_, + CoordGTensor tC_cSFD_, + ThrResidue residue_tC_cSFD_, + Params const* params_ptr_, + TileCoordMN tile_coord_mn_, + ElementType norm_constant_, + ElementType norm_constant_scaled_down_, + int thread_idx_, + TiledCopy_ const&) + : tC_rSFD(cute::forward(tC_rSFD_)) + , tC_gSFD(cute::forward(tC_gSFD_)) + , sAmaxs(cute::forward(sAmaxs_)) + , tC_cSFD(tC_cSFD_) + , residue_tC_cSFD(residue_tC_cSFD_) + , params_ptr(params_ptr_) + , norm_constant(norm_constant_) + , norm_constant_scaled_down(norm_constant_scaled_down_) + , tile_coord_mn(tile_coord_mn_) + , thread_idx(thread_idx_) {} + + static_assert(is_same_v); + RTensor tC_rSFD; + GTensor tC_gSFD; + STensor sAmaxs; + CoordGTensor tC_cSFD; + ThrResidue residue_tC_cSFD; + Params const* params_ptr; + ElementCompute norm_constant; + ElementCompute norm_constant_scaled_down; + TileCoordMN tile_coord_mn; + int thread_idx; + static constexpr int NumCollaboratingThreads = decltype(size(TiledCopy_{}))::value; + static_assert(NumCollaboratingThreads % NumThreadsPerWarpGroup == 0); + static constexpr int NumCollaboratingWarpGroups = NumCollaboratingThreads / NumThreadsPerWarpGroup; + static_assert(NumCollaboratingWarpGroups == 1 || 
NumCollaboratingWarpGroups == 2, + "SM120 epilogue currently only supports one or two warp groups collaborating."); + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, + int epi_v, + int epi_m, + int epi_n, + Array const& frg_input) { + return frg_input; + } + + template + CUTLASS_DEVICE void + reduce(SmemTensor&& smem_buffer, SyncFn const& sync_fn, int epi_m, int epi_n, bool is_last_iteration, VTensor visit_results) { + /* + Accumulator fragments are distributed across quads in different warps. + For SFVector = 16, we have: + + 8 elements 8 elements 8 elements 8 elements + <----------------><-----------------><-----------------><-----------------> + Warp 0 Quad 0 Warp 0 Quad 0 Warp 4 Quad 0 Warp 4 Quad 0 + Warp 0 Quad 1 Warp 0 Quad 1 Warp 4 Quad 1 Warp 4 Quad 1 + ... ... ... ... + Warp 0 Quad 7 Warp 0 Quad 7 Warp 4 Quad 7 Warp 4 Quad 7 + Warp 0 Quad 0 Warp 0 Quad 0 Warp 4 Quad 0 Warp 4 Quad 0 + Warp 0 Quad 1 Warp 0 Quad 1 Warp 4 Quad 1 Warp 4 Quad 1 + ... ... ... ... + Warp 0 Quad 7 Warp 0 Quad 7 Warp 4 Quad 7 Warp 4 Quad 7 + + + + + + In this case, row-wise scale factors are cooperatively reduced across 4 + threads from 1 quad in 1 warp. Each quad computes its own, local absolute + maximum without communicating with other warps through shared memory. + + For SFVector = 32, we have: + 8 elements 8 elements 8 elements 8 elements + <----------------><-----------------><-----------------><-----------------> + Warp 0 Quad 0 Warp 4 Quad 0 Warp 0 Quad 0 Warp 4 Quad 0 + Warp 0 Quad 1 Warp 4 Quad 1 Warp 0 Quad 1 Warp 4 Quad 1 + ... ... ... ... + Warp 0 Quad 7 Warp 4 Quad 7 Warp 0 Quad 7 Warp 4 Quad 7 + Warp 0 Quad 0 Warp 4 Quad 0 Warp 0 Quad 0 Warp 4 Quad 0 + Warp 0 Quad 1 Warp 4 Quad 1 Warp 0 Quad 1 Warp 4 Quad 1 + ... ... ... ... 
+ Warp 0 Quad 7 Warp 4 Quad 7 Warp 0 Quad 7 Warp 4 Quad 7 + + + + + + For SFVector = 64, we have: + 8 elements 8 elements 8 elements 8 elements + <----------------><-----------------><-----------------><-----------------> + Warp 0 Quad 0 Warp 2 Quad 0 Warp 4 Quad 0 Warp 6 Quad 0 + Warp 0 Quad 1 Warp 2 Quad 1 Warp 4 Quad 1 Warp 6 Quad 1 + ... ... ... ... + Warp 0 Quad 7 Warp 2 Quad 7 Warp 4 Quad 7 Warp 6 Quad 7 + Warp 0 Quad 0 Warp 2 Quad 0 Warp 4 Quad 0 Warp 6 Quad 0 + Warp 0 Quad 1 Warp 2 Quad 1 Warp 4 Quad 1 Warp 6 Quad 1 + ... ... ... ... + Warp 0 Quad 7 Warp 2 Quad 7 Warp 4 Quad 7 Warp 6 Quad 7 + + + + Thus, rowwise scale factors are cooperatively reduced across 8 threads + from two quads in two warps. Each quad first computes its own, local + absolute maximum and then shares this with the corresponding quad in the + other warp. In this case, a reduction through shared memory is needed. + + For a non-cooperative epilogue (in which each warpgroup computes a + separate tile), the pattern is the same as that above, except that warps 0 + and 2 are in the same row, and 1 and 3 are in the same row, and warps 4-7 + are not included. + */ + + // Accumulator fragments consist of two elements from two different rows of a 16x8 MMA output + static constexpr int ColsPerThreadAccFrag = 2; + static constexpr int RowsPerThreadAccFrag = 2; + static_assert(FragmentSize == + (ColsPerThreadAccFrag * RowsPerThreadAccFrag)); + + static constexpr int NumThreadsPerQuad = 4; + static_assert(SFVecSize == 16 || SFVecSize == 32 || SFVecSize == 64, "SF vector size must be either 16, 32 or 64."); + // A quad from two or four warps participate in computing each scale factor. 
+ constexpr int WarpsPerSF = SFVecSize / 16; + static_assert(WarpsPerSF == 1 || WarpsPerSF == 2 || WarpsPerSF == 4, "Only one, two or four warps are allowed in reduction."); + + constexpr bool IsInterWarpReductionNeeded = (WarpsPerSF != 1); + + // Number of fragments for each thread that are needed for computing a scale factor + static constexpr int AccFragsPerSF = SFVecSize / (ColsPerThreadAccFrag * NumThreadsPerQuad * WarpsPerSF); + static_assert(size<2>(visit_results) % AccFragsPerSF == 0, + "Fragments along N mode must be a multiple of the number of accumulator fragments needed per SF"); + + auto warp_idx = thread_idx / NumThreadsPerWarp; + auto warpgroup_idx = thread_idx / NumThreadsPerWarpGroup; + auto quad_idx_in_warp = (thread_idx % NumThreadsPerWarp) / NumThreadsPerQuad; + auto thread_idx_in_quad = thread_idx % NumThreadsPerQuad; + + cutlass::maximum_absolute_value_reduction amax_op; + cutlass::multiplies mul; + + Tensor tC_rSFD_flt = filter_zeros(tC_rSFD); + + auto synchronize = [&] () { + cutlass::arch::NamedBarrier::sync(NumCollaboratingThreads, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); + }; + + CUTLASS_PRAGMA_UNROLL + for (int sf_id = 0; sf_id < size(tC_rSFD_flt); ++sf_id) { + + auto coord = idx2crd(sf_id, tC_rSFD_flt.shape()); + auto row_in_acc = get<0,1,1>(coord); + auto row = crd2idx(get<1>(coord), get<1>(tC_rSFD_flt.shape())); + auto sf = crd2idx(get<2>(coord), get<2>(tC_rSFD_flt.shape())); + + // + // Compute amax for this scale factor + // + ElementCompute amax{0}; + + // Compute amax among vals owned by this thread for this vector + auto acc_frag_row = row_in_acc * RowsPerThreadAccFrag; + auto acc_frag_start_for_sf = sf * AccFragsPerSF; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < AccFragsPerSF; ++i) { + auto acc_frg = visit_results(0, row, acc_frag_start_for_sf + i); + amax = amax_op(amax, acc_frg[acc_frag_row]); + amax = amax_op(amax, acc_frg[acc_frag_row + 1]); + } + + // At this point, each thread has computed the amax of the 
values that it owns for this SF vector. + // We now need to compute the amax across threads. Because the TiledMMA uses an MmaThrLayout of <4,1,1>, + // we know that all fragments in this row will belong to threads in this warp. Furthermore, because + // SM120 narrow-precision MMAs have 16x8 output size with a quad owning two rows, we know that a quad + // will own all of the elements to be reduced via amax. Therefore, we can use warp shuffle intrinsics + // among threads in one quad to compute the amax. + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < 3; ++i) { + auto amax_other = __shfl_xor_sync(0xffffffff, amax, i); + amax = amax_op(amax, amax_other); + } + + if constexpr (IsInterWarpReductionNeeded) { + // At this point, all threads in the quad have the amax for the elements of the accumulator owned by its quad + // that should be used in computing the amax for this SF. Threads 0 in each quad of warps 0 and 2 + // (similarly, 1 and 3) now exchange amaxes to compute the final amax. + if (thread_idx_in_quad == 0) { + sAmaxs(quad_idx_in_warp, warp_idx) = amax; + } + synchronize(); + + // Get the amax broadcasted by the warp with which we share. + // Work on 4 warps per SFD generation + if constexpr (WarpsPerSF == 4) { + if constexpr (NumCollaboratingWarpGroups == 2) { + // This implementation assumes warp layout 2 x 4. + // For cooperative kernels (NumCollaboratingWarpGroups=2), + // warp 0 shares with 2 / 4 / 6, warp 1 shares with 3 / 5/ 7. 
+ auto amax_other2 = sAmaxs(quad_idx_in_warp, warp_idx ^ 2); + auto amax_other4 = sAmaxs(quad_idx_in_warp, warp_idx ^ 4); + auto amax_other6 = sAmaxs(quad_idx_in_warp, warp_idx ^ 6); + synchronize(); + amax = amax_op(amax, amax_other2); + amax = amax_op(amax, amax_other4); + amax = amax_op(amax, amax_other6); + } + else { + static_assert(cutlass::detail::dependent_false, "Unsupported warp layout."); + } + } + // Work on 2 warps per SFD generation + else if constexpr(WarpsPerSF == 2) { + // For cooperative kernels (NumCollaboratingWarpGroups=2), 0 shares + // with 4, 1 shares with 5, etc. For non-cooperative kernels + // (NumCollaboratingWarpGroups=1), 0 shares with 2, 1 shares with 3. + auto amax_other = sAmaxs( + quad_idx_in_warp, warp_idx ^ (1 << NumCollaboratingWarpGroups)); + synchronize(); + amax = amax_op(amax, amax_other); + } + } + + ElementCompute pvscale = mul(amax, norm_constant_scaled_down); + UnderlyingElementBlockScaleFactor qpvscale = NumericConverter{}(pvscale); + tC_rSFD_flt(coord) = qpvscale; + + // + // Apply the scale factor to the output + // + ElementCompute qpvscale_rcp = [&]() { + if constexpr (cute::is_same_v) { + // UE8M0: Use integer subtraction to do the fast rcp in ue8m0 and then convert to float. + auto e8m0_qpvscale_rcp = cutlass::reciprocal_approximate{}(qpvscale); + return cutlass::NumericConverter{}(e8m0_qpvscale_rcp); + } + else { + // UE4M3: Do the rcp in fp32 data type. 
+ auto qpvscale_up = cutlass::NumericConverter{}(qpvscale); + return cutlass::reciprocal_approximate_ftz{}(qpvscale_up); + } + }(); + + ElementCompute acc_scale = mul(norm_constant, qpvscale_rcp); + acc_scale = cutlass::minimum_with_nan_propagation{}(acc_scale, cutlass::platform::numeric_limits::max()); + + // Compute quantized output values + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < AccFragsPerSF; ++i) { + auto acc_frag = visit_results(0, row, acc_frag_start_for_sf + i); + visit_results(0, row, acc_frag_start_for_sf + i)[acc_frag_row ] = mul(acc_frag[acc_frag_row], acc_scale); + visit_results(0, row, acc_frag_start_for_sf + i)[acc_frag_row + 1] = mul(acc_frag[acc_frag_row + 1], acc_scale); + } + } // sf + + // Since scale factors are computed cooperatively across two quads from two warps, we only need one thread from the + // set of 8 cooperating threads to write out the data. We do this with thread 0 in each quad of the first warp that collaborates. + bool write_sf = (thread_idx_in_quad == 0); + if constexpr (NumCollaboratingWarpGroups == 2) { + // For cooperative kernels (NumCollaboratingWarpGroups=2), 0 shares with 4, 1 shares with 5, etc. + // Thus, only the warps in the first warpgroup need to write out scale factors. + if constexpr (IsInterWarpReductionNeeded) { + write_sf &= warp_idx < NumWarpsPerWarpGroup; + } + } + else { + if constexpr (IsInterWarpReductionNeeded) { + // When non-cooperative kernels apply inter warp reduce, they are with + // SF output rule as below : + // 1. warp 0 shares with 2 and 1 shares with 3 within each warpgroup. + // 2. warps 0 and 1 of the first warpgroup and 4 and 5 of the second + // warpgroup need to write output sf. 
+ write_sf &= ((warp_idx < 2) || (warpgroup_idx == 1 && warp_idx < 6)); + } + } + + if (write_sf && elem_less(tC_cSFD(_0{}, _0{}, _0{}, epi_m, epi_n), residue_tC_cSFD)) { + copy_aligned(tC_rSFD, tC_gSFD(_, _, _, _0{}, _0{}, get<0>(tile_coord_mn) + epi_m, get<1>(tile_coord_mn) + epi_n)); + } + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + using Sm1xxBlockScaledOutputConfig = cutlass::detail::Sm1xxBlockScaledOutputConfig; + UnderlyingElementBlockScaleFactor* ptr_scale_factor = nullptr; + // If Ptr-Array/Grouped GEMM with BlockScaleFactor per batch/group + if constexpr (!cute::is_same_v) { + ptr_scale_factor = params_ptr->ptr_scale_factor[l]; + l = 0; + } + else { + ptr_scale_factor = params_ptr->ptr_scale_factor; + } + + auto epi_tile_mn = shape<1>(zipped_divide(make_layout(take<0,2>(args.tile_shape_mnk)), args.epi_tile)); + Tensor mSFD = make_tensor(make_gmem_ptr(ptr_scale_factor), Sm1xxBlockScaledOutputConfig::tile_atom_to_shape_SFD(args.problem_shape_mnkl)); + + static_assert(size<1>(EpilogueTile{}) && ((size<1>(EpilogueTile{}) & (size<1>(EpilogueTile{}) - 1)) == 0), "Epilogue Tile N should be pow of 2"); + Tensor gSFD = local_tile(mSFD, args.epi_tile, make_coord(_, _,l)); // (EPI_M,EPI_N, #EPI_Ms, #EPI_Ns) + Tensor tCgSFD = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,#EPI_Ms, #EPI_Ns) + gSFD, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tCrSFD = make_tensor_like(take<0,3>(cute::layout(tCgSFD))); // (CPY,CPY_M,CPY_N) + + auto tile_coord_mn = make_coord(m * size<0>(epi_tile_mn), n * size<1>(epi_tile_mn)); + + // Fetch and compute these during initialization + Tensor mNormConst= make_tensor(make_gmem_ptr(params_ptr->norm_constant_ptr), make_layout(make_shape(M, N, L), 
params_ptr->norm_constant_stride)); + ElementCompute norm_constant = mNormConst(_0{},_0{},l); + ElementCompute fp_max = ElementCompute(cutlass::platform::numeric_limits::max()); + ElementCompute scale_down_factor = cutlass::reciprocal_approximate_ftz{}(fp_max); + ElementCompute norm_constant_scaled_down = cutlass::multiplies{}(norm_constant, scale_down_factor); + + Tensor sAmaxs = make_tensor( + make_smem_ptr(smem_aux), + make_layout(make_shape(Int{}, Int{})) + ); + + return ConsumerStoreCallbacks( + cute::move(tCrSFD), + cute::move(tCgSFD), + cute::move(sAmaxs), + args.tCcD, + args.residue_tCcD, + params_ptr, + tile_coord_mn, + norm_constant, + norm_constant_scaled_down, + args.thread_idx, + args.tiled_copy); + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + int SFVecSize, + class EpilogueTile, + class CtaTileShapeMNK, + int FragmentSize, + class ElementOutput, + class ElementCompute, + class ElementBlockScaleFactor, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +struct Sm120BlockScaleFactorColStore { + + static_assert(size<0>(EpilogueTile{}) % SFVecSize == 0, "EpilogueTileN should be divisible by SFVecSize"); + static_assert(size<0>(EpilogueTile{}) / SFVecSize == 1 or + size<0>(EpilogueTile{}) / SFVecSize == 2 or + size<0>(EpilogueTile{}) / SFVecSize == 4, + "Possible store in interleaved 4B aligned format"); + + static constexpr int NumWarpgroups = 2; + static constexpr int NumSyncWarps = NumWarpsPerWarpGroup * NumWarpgroups; + static constexpr int NumThreadsPerQuad = 4; + static constexpr int NumSyncElementsCrossWarp = NumSyncWarps * NumThreadsPerQuad; + struct SharedStorage { + array_aligned smem_aux; + }; + + using NormalConstStrideMNL = Stride<_0,_0,int64_t>; + + struct Arguments { + ElementBlockScaleFactor* ptr_scale_factor = {}; + // A matrix wide constant value to scale the output matrix + // Avoids generating small FP4 values. 
+ ElementCompute const* norm_constant_ptr = {}; + NormalConstStrideMNL norm_constant_stride = {}; + }; + using Params = Arguments; + + using UnderlyingElementBlockScaleFactor = cute::remove_pointer_t; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M,N,K,L] = problem_shape_MNKL; + bool implementable = (M % SFVecSize == 0); + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: [EVT Sm120BlockScaleFactorColStore] N-dim should be divisible by SFVecSize.\n"); + } + return implementable; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm120BlockScaleFactorColStore() { } + + CUTLASS_HOST_DEVICE + Sm120BlockScaleFactorColStore(Params const& params, SharedStorage const& shared_storage) + : params_ptr(¶ms) + , smem_aux(const_cast(shared_storage.smem_aux.data())) { } + + Params const* params_ptr = nullptr; + ElementCompute *smem_aux = nullptr; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template < + class RTensor, + class GTensor, + class STensor, + class CoordGTensor, + class ThrResidue, + class TileCoordMN, + class ElementType, + class TiledCopy_ + > + struct ConsumerStoreCallbacks : 
EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + RTensor&& tC_rSFD_, + GTensor&& tC_gSFD_, + STensor&& sAmaxs_, + CoordGTensor tC_cSFD_, + ThrResidue residue_tC_cSFD_, + Params const* params_ptr_, + TileCoordMN tile_coord_mn_, + ElementType norm_constant_, + ElementType norm_constant_scaled_down_, + int thread_idx_, + TiledCopy_ const&) + : tC_rSFD(cute::forward(tC_rSFD_)) + , tC_gSFD(cute::forward(tC_gSFD_)) + , sAmaxs(cute::forward(sAmaxs_)) + , tC_cSFD(tC_cSFD_) + , residue_tC_cSFD(residue_tC_cSFD_) + , params_ptr(params_ptr_) + , norm_constant(norm_constant_) + , norm_constant_scaled_down(norm_constant_scaled_down_) + , tile_coord_mn(tile_coord_mn_) + , thread_idx(thread_idx_) {} + + static_assert(is_same_v); + RTensor tC_rSFD; + GTensor tC_gSFD; + STensor sAmaxs; + CoordGTensor tC_cSFD; + ThrResidue residue_tC_cSFD; + Params const* params_ptr; + ElementCompute norm_constant; + ElementCompute norm_constant_scaled_down; + TileCoordMN tile_coord_mn; + int thread_idx; + static constexpr int NumCollaboratingThreads = decltype(size(TiledCopy_{}))::value; + static_assert(NumCollaboratingThreads % NumThreadsPerWarpGroup == 0); + static constexpr int NumCollaboratingWarpGroups = NumCollaboratingThreads / NumThreadsPerWarpGroup; + static_assert(NumCollaboratingWarpGroups == 2, + "SM120 epilogue currently only supports two warp groups collaborating."); + static_assert(SFVecSize == 16 || SFVecSize == 32 || SFVecSize == 64, "SF vector size must be either 16, 32 or 64."); + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, + int epi_v, + int epi_m, + int epi_n, + Array const& frg_input) { + return frg_input; + } + + template + CUTLASS_DEVICE void + reduce(SmemTensor&& smem_buffer, SyncFn const& sync_fn, int epi_m, int epi_n, bool is_last_iteration, VTensor visit_results) { + /* + Accumulator fragments are distributed across threads/quads in different warps. For column major, the + reduction happens along M dimension. 
For SFVector = 32, we have: + + 8 elements 8 elements 8 elements 8 elements + + <----------------------><----------------------><----------------------><----------------------> + | Warp 0 Quad 0 Warp 4 Quad 0 Warp 0 Quad 0 Warp 4 Quad 0 + | Warp 0 Quad 1 Warp 4 Quad 1 Warp 0 Quad 1 Warp 4 Quad 1 + | ... ... ... ... + 1 | Warp 0 Quad 7 Warp 4 Quad 7 Warp 0 Quad 7 Warp 4 Quad 7 + 6 | Warp 0 Quad 0 Warp 4 Quad 0 Warp 0 Quad 0 Warp 4 Quad 0 + | Warp 0 Quad 1 Warp 4 Quad 1 Warp 0 Quad 1 Warp 4 Quad 1 + | ... ... ... ... + + Warp 0 Quad 7 Warp 4 Quad 7 Warp 0 Quad 7 Warp 4 Quad 7 + | Warp 1 Quad 0 Warp 5 Quad 0 Warp 1 Quad 0 Warp 5 Quad 0 + | Warp 1 Quad 1 Warp 5 Quad 1 Warp 1 Quad 1 Warp 5 Quad 1 + 1 | ... ... ... ... + 6 | Warp 1 Quad 7 Warp 5 Quad 7 Warp 1 Quad 7 Warp 5 Quad 7 + | Warp 1 Quad 0 Warp 5 Quad 0 Warp 1 Quad 0 Warp 5 Quad 0 + | Warp 1 Quad 1 Warp 5 Quad 1 Warp 1 Quad 1 Warp 5 Quad 1 + | ... ... ... ... + | Warp 1 Quad 7 Warp 5 Quad 7 Warp 1 Quad 7 Warp 5 Quad 7 + + + + In this case, colum-wise scale factors are cooperatively reduced across 8 threads from 2 warps. + Each column first computes its own, local absolute maximum and then shares this with the + corresponding threads in the other warp. In this case, a reduction through shared memory is needed. + + For SFVector = 64, the reduction happens inside 4 warps: warp 0/1/2/3 and warp 4/5/6/7. 
+ */ + + // Accumulator fragments consist of two elements from two different columns of a 16x8 MMA output + static constexpr int RowsPerThreadAccFrag = 2; + static constexpr int ColsPerThreadAccFrag = 2; + static_assert(FragmentSize == (ColsPerThreadAccFrag * RowsPerThreadAccFrag)); + + static constexpr int NumThreadsPerCol = NumThreadsPerWarp / NumThreadsPerQuad; + constexpr int WarpsPerSF = SFVecSize / NumThreadsPerCol / ColsPerThreadAccFrag; + static_assert(WarpsPerSF == 1 || WarpsPerSF == 2 || WarpsPerSF == 4, "Only one, two or four warps are allowed in reduction."); + + auto warp_idx = thread_idx / NumThreadsPerWarp; + auto thread_idx_in_warp = thread_idx % NumThreadsPerWarp; + + cutlass::maximum_absolute_value_reduction amax_op; + cutlass::multiplies mul; + + auto synchronize = [&] () { + // When WarpsPerSF equals 1, data processing is inside warp, there is no needs to have the sync. + static constexpr bool NoSyncNeeded = (WarpsPerSF == 1); + if(NoSyncNeeded) + return; + cutlass::arch::NamedBarrier::sync(NumCollaboratingThreads, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); + }; + + CUTLASS_PRAGMA_UNROLL + for(int mma_in_epi = 0; mma_in_epi < size<1>(tC_rSFD)*size<2>(tC_rSFD); ++mma_in_epi) { + + CUTLASS_PRAGMA_UNROLL + for (int sf_id = 0; sf_id < ColsPerThreadAccFrag; ++sf_id) { + + // + // Compute amax for this scale factor + // + ElementCompute amax{0}; + + // Compute amax among vals owned by this thread for this vector + auto acc_frg = visit_results(mma_in_epi); + amax = amax_op(amax, acc_frg[sf_id]); + amax = amax_op(amax, acc_frg[sf_id + ColsPerThreadAccFrag]); + + // At this point, each thread has computed the amax of the values that it owns for this SF vector. + // We now need to compute the amax across threads. Because SM120 narrow-precision MMAs have 16x8 output + // size with a quad owning two rows, we know that 8 threads in one column will own all of the 16 elements + // to be reduced via amax. 
Therefore, we can use warp shuffle intrinsics among threads to compute the amax. + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < NumThreadsPerCol; ++i) { + auto amax_other = __shfl_xor_sync(0xffffffff, amax, (i * NumThreadsPerQuad)); + amax = amax_op(amax, amax_other); + } + + // At this point, all threads in the quad have the amax for the elements of the accumulator owned by its + // threads that should be used in computing the amax for this SF. + if (thread_idx_in_warp < NumThreadsPerQuad && WarpsPerSF != 1) { + sAmaxs(thread_idx_in_warp, warp_idx) = amax; + } + + synchronize(); + + // Get the amax broadcasted by the warp with which we share. + // For cooperative kernels, when scale factor vector size is 32 (WarpsPerSF equals 2), + // warp 0 shares with 1, warp2 shares with 2, etc. + // When vector size is 64 (WarpsPerSF equals 4), warp 0 shares with 1/2/3, and 4 shares with 5/6/7. + // When vector size is 16, no needs to swap between warps. + if constexpr (2 == WarpsPerSF) { + auto amax_other = sAmaxs(thread_idx % NumThreadsPerQuad, warp_idx ^ 1); + amax = amax_op(amax, amax_other); + } + else if constexpr (4 == WarpsPerSF) { + auto amax_other1 = sAmaxs(thread_idx % NumThreadsPerQuad, warp_idx ^ 1); + auto amax_other2 = sAmaxs(thread_idx % NumThreadsPerQuad, warp_idx ^ 2); + auto amax_other3 = sAmaxs(thread_idx % NumThreadsPerQuad, warp_idx ^ 3); + amax = amax_op(amax, amax_other1); + amax_other2 = amax_op(amax_other2, amax_other3); + amax = amax_op(amax, amax_other2); + } + synchronize(); + + ElementCompute pvscale = mul(amax, norm_constant_scaled_down); + UnderlyingElementBlockScaleFactor qpvscale = NumericConverter{}(pvscale); + filter(tC_rSFD)(sf_id + mma_in_epi*ColsPerThreadAccFrag) = qpvscale; + + // + // Apply the scale factor to the output + // + ElementCompute qpvscale_rcp = [&]() { + if constexpr (cute::is_same_v) { + // UE8M0: Use integer subtraction to do the fast rcp in ue8m0 and then convert to float. 
+ auto e8m0_qpvscale_rcp = cutlass::reciprocal_approximate{}(qpvscale); + return cutlass::NumericConverter{}(e8m0_qpvscale_rcp); + } + else { + // UE4M3: Do the rcp in fp32 data type. + auto qpvscale_up = cutlass::NumericConverter{}(qpvscale); + return cutlass::reciprocal_approximate_ftz{}(qpvscale_up); + } + }(); + + ElementCompute acc_scale = mul(norm_constant, qpvscale_rcp); + acc_scale = cutlass::minimum_with_nan_propagation{}(acc_scale, cutlass::platform::numeric_limits::max()); + + // Compute quantized output values + visit_results(mma_in_epi)[sf_id ] = mul(acc_frg[sf_id ], acc_scale); + visit_results(mma_in_epi)[sf_id + ColsPerThreadAccFrag] = mul(acc_frg[sf_id + ColsPerThreadAccFrag], acc_scale); + } // end for sf_id + } // end for mma_in_epi + + // Since scale factors are computed cooperatively across two or four warps, we only need one thread from the + // cooperating column threads group to write out the data. + bool write_sf = (thread_idx_in_warp < NumThreadsPerQuad); + if constexpr (2 == WarpsPerSF) { + // Output warp {0, 2, 4, 6}. + write_sf &= ((warp_idx & 0x1) == 0); + } + else if constexpr (4 == WarpsPerSF) { + // Output warp {0, 4}. + write_sf &= ((warp_idx & 0x3) == 0); + } + else if constexpr (1 == WarpsPerSF) { + // Output warp {0, 1, ..., 7}. Keep write_sf as is. + } + + if (write_sf && elem_less(tC_cSFD(_0{}, _0{}, _0{}, epi_m, epi_n), residue_tC_cSFD)) { + copy_aligned(tC_rSFD, tC_gSFD(_, _, _, _0{}, _0{}, get<0>(tile_coord_mn) + epi_m, get<1>(tile_coord_mn) + epi_n)); + } + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + using Sm1xxBlockScaledOutputConfig= cutlass::detail::Sm1xxBlockScaledOutputConfig; + UnderlyingElementBlockScaleFactor* ptr_scale_factor = nullptr; + // If Ptr-Array/Grouped GEMM with BlockScaleFactor per batch/group + if constexpr (!cute::is_same_v) { + ptr_scale_factor = params_ptr->ptr_scale_factor[l]; + l = 0; + } + else { + ptr_scale_factor = params_ptr->ptr_scale_factor; + } + + static_assert(size<0>(EpilogueTile{}) && ((size<0>(EpilogueTile{}) & (size<1>(EpilogueTile{}) - 1)) == 0), + "Epilogue Tile N should be pow of 2"); + + auto epi_tile_mn = shape<1>(zipped_divide(make_layout(take<0,2>(args.tile_shape_mnk)), args.epi_tile)); + Tensor mSFD = make_tensor(make_gmem_ptr(ptr_scale_factor), + Sm1xxBlockScaledOutputConfig::tile_atom_to_shape_SFD(args.problem_shape_mnkl)); + + Tensor gSFD = local_tile(mSFD, args.epi_tile, make_coord(_, _,l)); // (EPI_M,EPI_N, #EPI_Ms, #EPI_Ns) + Tensor tCgSFD = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,#EPI_Ms, #EPI_Ns) + gSFD, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tCrSFD = make_tensor_like(take<0,3>(cute::layout(tCgSFD))); // (CPY,CPY_M,CPY_N) + + auto tile_coord_mn = make_coord(m * size<0>(epi_tile_mn), n * size<1>(epi_tile_mn)); + + // Fetch and compute these during initialization + Tensor mNormConst= make_tensor(make_gmem_ptr(params_ptr->norm_constant_ptr), make_layout(make_shape(M, N, L), params_ptr->norm_constant_stride)); + ElementCompute norm_constant = mNormConst(_0{},_0{},l); + ElementCompute fp_max = ElementCompute(cutlass::platform::numeric_limits::max()); + ElementCompute scale_down_factor = cutlass::reciprocal_approximate_ftz{}(fp_max); + ElementCompute norm_constant_scaled_down = cutlass::multiplies{}(norm_constant, scale_down_factor); + + Tensor sAmaxs = make_tensor( + 
make_smem_ptr(smem_aux), + make_layout(make_shape(Int{}, Int{})) + ); + + return ConsumerStoreCallbacks( + cute::move(tCrSFD), + cute::move(tCgSFD), + cute::move(sAmaxs), + args.tCcD, + args.residue_tCcD, + params_ptr, + tile_coord_mn, + norm_constant, + norm_constant_scaled_down, + args.thread_idx, + args.tiled_copy); + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::fusion diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp new file mode 100644 index 0000000000000000000000000000000000000000..95e8208686ead6606040ee280023a7f5b879b07b --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp @@ -0,0 +1,2792 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief Fusion callbacks specializations for the sm90 TMA warp-specialized (ws) epilogue +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" + +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/fusion/callbacks.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp" + +#include "cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::fusion { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +using Sm90EVT = Sm90TreeVisitor; + +// D = alpha * acc +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::ScaledAcc, + CtaTileShapeMNK, + EpilogueTile +> : Sm90EVT, + Sm90ScalarBroadcast>, + Sm90AccFetch + > { + using Impl = + Sm90EVT, + Sm90ScalarBroadcast>, + Sm90AccFetch + >; + using Operation = fusion::ScaledAcc; + + struct Arguments { + // Give a name and flat ordering to the fusion callback args + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + + // Conversion to the args expected by the visitor implementation + // to_underlying_arguments will implicitly call this + operator typename 
Impl::Arguments() const { + return + { // binary op : alpha * acc + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // binary args : multiplies + }; // end binary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = alpha * acc + beta * C +template< + class ElementOutput, + class ElementCompute, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinearCombination = + Sm90EVT, // beta * C + (alpha * acc) + Sm90ScalarBroadcast>, // beta + Sm90SrcFetch, // C + Sm90EVT, // alpha * acc + Sm90ScalarBroadcast>, // alpha + Sm90AccFetch // acc + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementSource, + class ElementScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::LinearCombination, + CtaTileShapeMNK, + EpilogueTile +> : Sm90LinearCombination::type, ElementCompute, ElementSource, ElementScalar, RoundStyle> { + + using Impl = Sm90LinearCombination::type, ElementCompute, ElementSource, ElementScalar, RoundStyle>; + using Operation = fusion::LinearCombination; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + operator typename Impl::Arguments() const { + return + { // ternary op : beta * C + (alpha * acc) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C 
+ { // binary op : alpha * acc + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // binary args : multiplies + }, // end binary op + {} // ternary args : multiply_add + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = alpha * acc + beta * C, where beta and alpha can be vectors for each batch +template< + class ElementOutput, + class ElementCompute, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinearCombinationPtrArray = + Sm90EVT, // beta * C + (alpha * acc) + Sm90ScalarBroadcastPtrArray>, // beta + Sm90SrcFetch, // C + Sm90EVT, // alpha * acc + Sm90ScalarBroadcastPtrArray>, // alpha + Sm90AccFetch // acc + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + int NumEpilogueWarpGroups, + class ElementOutput, + class ElementCompute, + class ElementSource, + class ElementScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90PtrArrayTmaWarpSpecialized, + fusion::LinearCombination, + CtaTileShapeMNK, + EpilogueTile +> : Sm90LinearCombinationPtrArray::type, ElementCompute, ElementSource, ElementScalar, RoundStyle> { + + using Impl = Sm90LinearCombinationPtrArray::type, ElementCompute, ElementSource, ElementScalar, RoundStyle>; + using Operation = fusion::LinearCombination; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementScalar const* const* alpha_ptr_array = nullptr; + ElementScalar const* const* beta_ptr_array = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = 
Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + operator typename Impl::Arguments() const { + return + { // ternary op : beta * C + (alpha * acc) + {{beta}, {beta_ptr}, {beta_ptr_array}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // binary op : alpha * acc + {{alpha}, {alpha_ptr}, {alpha_ptr_array}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // binary args : multiplies + }, // end binary op + {} // ternary args : multiply_add + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = activation(alpha * acc + beta * C) +template< + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinCombEltAct = + Sm90EVT, // activation(beta * C + (alpha * acc)) + Sm90LinearCombination // beta * C + (alpha * acc) + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementSource, + class ElementScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::LinCombEltAct, + CtaTileShapeMNK, + EpilogueTile +> : Sm90LinCombEltAct { + + using Impl = Sm90LinCombEltAct::type, ElementCompute, ElementSource, ElementScalar, RoundStyle>; + using Operation = fusion::LinCombEltAct; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + 
StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { // unary op: activation(beta * C + (alpha * acc)) + { // ternary op : beta * C + (alpha * acc) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // binary op : alpha * acc + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // binary args : multiplies + }, // end binary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args: activation + }; // end unary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = activation(alpha * acc + beta * C), where beta and alpha can be vectors for each batch +template< + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinCombEltActPtrArray = + Sm90EVT, // activation(beta * C + (alpha * acc)) + Sm90LinearCombinationPtrArray // beta * C + (alpha * acc) + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + int NumEpilogueWarpGroups, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementSource, + class ElementScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90PtrArrayTmaWarpSpecialized, + fusion::LinCombEltAct, + CtaTileShapeMNK, + EpilogueTile +> : Sm90LinCombEltActPtrArray { + + using Impl = Sm90LinCombEltActPtrArray::type, ElementCompute, ElementSource, ElementScalar, RoundStyle>; + using Operation = 
fusion::LinCombEltAct; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + ElementScalar const* const* alpha_ptr_array = nullptr; + ElementScalar const* const* beta_ptr_array = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { // unary op: activation(beta * C + (alpha * acc)) + { // ternary op : beta * C + (alpha * acc) + {{beta}, {beta_ptr}, {beta_ptr_array}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // binary op : alpha * acc + {{alpha}, {alpha_ptr}, {alpha_ptr_array}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // binary args : multiplies + }, // end binary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args: activation + }; // end unary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = alpha * acc + beta * C + per-row bias +template< + class CtaTileShapeMNK, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinCombPerRowBias = + Sm90EVT, // beta * C + (alpha * acc + bias) + Sm90ScalarBroadcast>, // beta + Sm90SrcFetch, // C + Sm90EVT, // alpha * acc + bias + Sm90ScalarBroadcast>, // alpha + Sm90AccFetch, // acc + Sm90ColBroadcast<0, CtaTileShapeMNK, ElementBias, ElementCompute, 
Stride<_1,_0,int64_t>, AlignmentBias> // bias + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::LinCombPerRowBias, + CtaTileShapeMNK, + EpilogueTile +> : Sm90LinCombPerRowBias< + CtaTileShapeMNK, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle> { + using Impl = Sm90LinCombPerRowBias< + CtaTileShapeMNK, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>; + using Operation = fusion::LinCombPerRowBias< + ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_1,_0,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + operator typename Impl::Arguments() const { + return + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + 
+///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = alpha * acc + beta * C + per-column bias +template< + int StagesC, + class CtaTileShapeMNK, + class EpilogueTile, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinCombPerColBias = + Sm90EVT, // beta * C + (alpha * acc + bias) + Sm90ScalarBroadcast>, // beta + Sm90SrcFetch, // C + Sm90EVT, // alpha * acc + bias + Sm90ScalarBroadcast>, // alpha + Sm90AccFetch, // acc + Sm90RowBroadcast<0, CtaTileShapeMNK, ElementBias, ElementCompute, Stride<_0,_1,int64_t>, AlignmentBias> // bias + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::LinCombPerColBias, + CtaTileShapeMNK, + EpilogueTile +> : Sm90LinCombPerColBias< + StagesC, CtaTileShapeMNK, EpilogueTile, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle> { + using Impl = Sm90LinCombPerColBias< + StagesC, CtaTileShapeMNK, EpilogueTile, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>; + using Operation = fusion::LinCombPerColBias< + ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle>; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using 
StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_0,_1,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + operator typename Impl::Arguments() const { + return + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = activation(alpha * acc + beta * C + per-row bias) +template< + class CtaTileShapeMNK, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinCombPerRowBiasEltAct = + Sm90EVT, + Sm90LinCombPerRowBias + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::LinCombPerRowBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : 
Sm90LinCombPerRowBiasEltAct< + CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + > { + + using Impl = + Sm90LinCombPerRowBiasEltAct< + CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + using Operation = + fusion::LinCombPerRowBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_1,_0,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { // unary op : activation(beta * C + (alpha * acc + bias)) + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args : activation + }; // end unary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = activation(alpha * acc + beta * C + per-column bias) +template< + int StagesC, + class 
CtaTileShapeMNK, + class EpilogueTile, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinCombPerColBiasEltAct = + Sm90EVT, + Sm90LinCombPerColBias + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::LinCombPerColBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm90LinCombPerColBiasEltAct< + StagesC, CtaTileShapeMNK, EpilogueTile, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + > { + + using Impl = + Sm90LinCombPerColBiasEltAct< + StagesC, CtaTileShapeMNK, EpilogueTile, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + using Operation = + fusion::LinCombPerColBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = 
Stride<_0,_1,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { // unary op : activation(beta * C + (alpha * acc + bias)) + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args : activation + }; // end unary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = activation(alpha * acc + beta * C + per-row bias) +// Aux = alpha * acc + beta * C + per-row bias) +template< + class CtaTileShapeMNK, + class EpilogueTile, + int Stages, + class StrideAux, + class SmemLayoutAtom, + class CopyOpR2S, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux = ElementOutput, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentAux = 128 / sizeof_bits_v, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinCombPerRowBiasEltActAux = + Sm90EVT, + Sm90EVT, + Sm90LinCombPerRowBias + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class GmemLayoutTagAux, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux, + class ElementBias, + class ElementSource, + class ElementScalar, + int 
AlignmentAux, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile, + class SmemLayoutAtom, + class CopyOpR2S +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::LinCombPerRowBiasEltActAux< + GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute, + ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile, + SmemLayoutAtom, + CopyOpR2S +> : Sm90LinCombPerRowBiasEltActAux< + CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t, SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + > { + + using Impl = + Sm90LinCombPerRowBiasEltActAux< + CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t, SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >; + using Operation = + fusion::LinCombPerRowBiasEltActAux< + GmemLayoutTagAux, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_1,_0,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + using StrideAux = cutlass::gemm::TagToStrideC_t; + ElementAux* aux_ptr = nullptr; + StrideAux dAux = 
{}; + + operator typename Impl::Arguments() const { + return + { // unary op : activation(store(beta * C + (alpha * acc + bias))) + { // unary op : store(beta * C + (alpha * acc + bias)) + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + {aux_ptr, dAux} // unary args : store + }, // end unary op + activation // unary args : activation + }; // end unary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// D = activation(alpha * acc + beta * C + per_col bias) +// Aux = alpha * acc + beta * C + per_col bias) +template< + int StagesC, + class CtaTileShapeMNK, + class EpilogueTile, + int Stages, + class StrideAux, + class SmemLayoutAtom, + class CopyOpR2S, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux = ElementOutput, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentAux = 128 / sizeof_bits_v, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinCombPerColBiasEltActAux = + Sm90EVT, + Sm90EVT, + Sm90LinCombPerColBias + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class GmemLayoutTagAux, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentAux, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class 
CtaTileShapeMNK, + class EpilogueTile, + class SmemLayoutAtom, + class CopyOpR2S +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::LinCombPerColBiasEltActAux< + GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute, + ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile, + SmemLayoutAtom, + CopyOpR2S +> : Sm90LinCombPerColBiasEltActAux< + StagesC, CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t, SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + > { + + using Impl = + Sm90LinCombPerColBiasEltActAux< + StagesC, CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t, SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >; + using Operation = + fusion::LinCombPerColBiasEltActAux< + GmemLayoutTagAux, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_0,_1,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + using StrideAux = cutlass::gemm::TagToStrideC_t; + ElementAux* aux_ptr = nullptr; + StrideAux dAux = {}; + + operator typename Impl::Arguments() const { + 
return + { // unary op : activation(store(beta * C + (alpha * acc + bias))) + { // unary op : store(beta * C + (alpha * acc + bias)) + { // ternary op : beta * C + (alpha * acc + bias) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + {aux_ptr, dAux} // unary args : store + }, // end unary op + activation // unary args : activation + }; // end unary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = per-row alpha * acc + per-row beta * C + per-row bias +template< + class CtaTileShapeMNK, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + int AlignmentScalar = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90PerRowLinCombPerRowBias = + Sm90EVT, // beta * C + (alpha * acc + bias) + Sm90ColBroadcast<0, CtaTileShapeMNK, ElementScalar, ElementCompute, Stride, AlignmentScalar>, // beta, dynamic scalar/vector broadcast + Sm90SrcFetch, // C + Sm90EVT, // alpha * acc + bias + Sm90ColBroadcast<0, CtaTileShapeMNK, ElementScalar, ElementCompute, Stride, AlignmentScalar>, // alpha, dynamic scalar/vector broadcast + Sm90AccFetch, // acc + Sm90ColBroadcast<0, CtaTileShapeMNK, ElementBias, ElementCompute, Stride<_1,_0,int64_t>, AlignmentBias> // bias + > + >; + +// D = activation(per-row alpha * acc + per-row beta * C + per-row bias) +template< + class CtaTileShapeMNK, + template class ActivationFn, + class ElementOutput, + class 
ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + int AlignmentScalar = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90PerRowLinCombPerRowBiasEltAct = + Sm90EVT, + Sm90PerRowLinCombPerRowBias + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + int AlignmentScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::PerRowLinCombPerRowBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm90PerRowLinCombPerRowBiasEltAct< + CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle + > { + + using Impl = + Sm90PerRowLinCombPerRowBiasEltAct< + CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle + >; + using Operation = + fusion::PerRowLinCombPerRowBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle + >; + + struct Arguments { + using StrideAlpha = Stride; + using StrideBeta = Stride; + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + StrideAlpha dAlpha = {bool(1), _0{}, 0}; + StrideBeta dBeta = {bool(1), _0{}, 0}; + + using StrideBias = 
Stride<_1,_0,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { // unary op : activation(beta * C + (alpha * acc + bias)) + { // ternary op : beta * C + (alpha * acc + bias) + {beta_ptr, beta, dBeta}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {alpha_ptr, alpha, dAlpha}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args : activation + }; // end unary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = per-col alpha * acc + per-col beta * C + per-column bias +template< + int StagesC, + class CtaTileShapeMNK, + class EpilogueTile, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + int AlignmentScalar = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90PerColLinCombPerColBias = + Sm90EVT, // beta * C + (alpha * acc + bias) + Sm90RowBroadcast<0, CtaTileShapeMNK, ElementScalar, ElementCompute, Stride<_0,bool,int64_t>, AlignmentScalar>, // beta, dynamic scalar/vector broadcast + Sm90SrcFetch, // C + Sm90EVT, // alpha * acc + bias + Sm90RowBroadcast<0, CtaTileShapeMNK, ElementScalar, ElementCompute, Stride<_0,bool,int64_t>, AlignmentScalar>, // alpha, dynamic scalar/vector broadcast + Sm90AccFetch, // acc + Sm90RowBroadcast<0, CtaTileShapeMNK, ElementBias, ElementCompute, Stride<_0,_1,int64_t>, AlignmentBias> // 
bias + > + >; + +// D = activation(per-col alpha * acc + per-col beta * C + per-column bias) +template< + int StagesC, + class CtaTileShapeMNK, + class EpilogueTile, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + int AlignmentScalar = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90PerColLinCombPerColBiasEltAct = + Sm90EVT, + Sm90PerColLinCombPerColBias + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + int AlignmentScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::PerColLinCombPerColBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm90PerColLinCombPerColBiasEltAct< + StagesC, CtaTileShapeMNK, EpilogueTile, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle + > { + + using Impl = + Sm90PerColLinCombPerColBiasEltAct< + StagesC, CtaTileShapeMNK, EpilogueTile, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle + >; + using Operation = + fusion::PerColLinCombPerColBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = 
ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,bool,int64_t>; + using StrideBeta = Stride<_0,bool,int64_t>; + StrideAlpha dAlpha = {_0{}, bool(1), 0}; + StrideBeta dBeta = {_0{}, bool(1), 0}; + + using StrideBias = Stride<_0,_1,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { // unary op : activation(beta * C + (alpha * acc + bias)) + { // ternary op : beta * C + (alpha * acc + bias) + {beta_ptr, beta, dBeta}, // leaf args : beta + {}, // leaf args : C + { // ternary op : alpha * acc + bias + {alpha_ptr, alpha, dAlpha}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args : activation + }; // end unary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = activation(per-col alpha * acc + per-column bias) + per-col beta * C +template< + class CtaTileShapeMNK, + class EpilogueTile, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + int AlignmentScalar = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90PerColResAddPerColBiasEltAct = + Sm90EVT, // beta * C + activation(alpha * acc + bias) + Sm90RowBroadcast<0, CtaTileShapeMNK, ElementScalar, ElementCompute, Stride<_0,bool,int64_t>, AlignmentScalar>, // beta, dynamic scalar/vector 
broadcast + Sm90SrcFetch, // C + Sm90EVT, // activation(alpha * acc + bias) + Sm90EVT, // alpha * acc + bias + Sm90RowBroadcast<0, CtaTileShapeMNK, ElementScalar, ElementCompute, Stride<_0,bool,int64_t>, AlignmentScalar>, // alpha, dynamic scalar/vector broadcast + Sm90AccFetch, // acc + Sm90RowBroadcast<0, CtaTileShapeMNK, ElementBias, ElementCompute, Stride<_0,_1,int64_t>, AlignmentBias> // bias + > + > + >; + + template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + int AlignmentScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::PerColResAddPerColBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm90PerColResAddPerColBiasEltAct< + CtaTileShapeMNK, EpilogueTile, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle + > { + + using Impl = + Sm90PerColResAddPerColBiasEltAct< + CtaTileShapeMNK, EpilogueTile, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle + >; + using Operation = + fusion::PerColResAddPerColBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, AlignmentScalar, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,bool,int64_t>; + using StrideBeta = Stride<_0,bool,int64_t>; + StrideAlpha dAlpha 
= {_0{}, bool(1), 0}; + StrideBeta dBeta = {_0{}, bool(1), 0}; + + using StrideBias = Stride<_0,_1,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { // ternary op : beta * C + activation(alpha * acc + bias) + {beta_ptr, beta, dBeta}, // leaf args : beta + {}, // leaf args : C + { // unary op : activation(alpha * acc + bias) + { // ternary op : alpha * acc + bias + {alpha_ptr, alpha, dAlpha}, // leaf args : alpha + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args : activation + }, // end unary op + {} // ternary args : multiply_add + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template +constexpr bool is_fp8_v = cute::is_same_v || cute::is_same_v; + +// We only apply the scaling factor if output is fp8 +template +struct ScaleOutOp { template using Op = cutlass::first; }; +template <> +struct ScaleOutOp { template using Op = cutlass::multiplies; }; +template <> +struct ScaleOutOp { template using Op = cutlass::multiplies; }; + +template +using amax = cutlass::maximum_absolute_value_reduction; // propogate nans + +}; // end namespace detail + +// D = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-row bias +template< + class CtaTileShapeMNK, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90ScaledLinCombPerRowBias = + Sm90EVT, // beta * C + (alpha * 
acc + bias) + Sm90ScalarBroadcast, 2>, // scale_c * beta + Sm90SrcFetch, // C + Sm90EVT, // alpha * acc + bias + Sm90ScalarBroadcast, 3>, // scale_a * scale_b * alpha + Sm90AccFetch, // acc + Sm90ColBroadcast<0, CtaTileShapeMNK, ElementBias, ElementCompute, Stride<_1,_0,int64_t>, AlignmentBias> // bias + > + >; + +// Z = scale_a * scale_b * alpha * acc + beta * scale_c * C + per-row bias +// if D is fp8 +// D = scale_d * activation(Z) +// else +// D = activation(Z) +template< + class CtaTileShapeMNK, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90ScaledLinCombPerRowBiasEltAct = + Sm90EVT::template Op, ElementOutput, ElementCompute, RoundStyle>, // activation(Z) * scale_d + Sm90EVT, // activation(Z) + // Z = scale_a * scale_b * alpha * acc + beta * scale_c * C + per-row bias + Sm90ScaledLinCombPerRowBias + >, + Sm90ScalarBroadcast // scale_d + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::ScaledLinCombPerRowBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm90ScaledLinCombPerRowBiasEltAct< + CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + > { + + using Impl = + Sm90ScaledLinCombPerRowBiasEltAct< + CtaTileShapeMNK, ActivationFn, 
ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + using Operation = + fusion::ScaledLinCombPerRowBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + ElementScalar scale_a = ElementScalar(1); + ElementScalar scale_b = ElementScalar(1); + ElementScalar scale_c = ElementScalar(1); + ElementScalar scale_d = ElementScalar(1); + ElementScalar const* scale_a_ptr = nullptr; + ElementScalar const* scale_b_ptr = nullptr; + ElementScalar const* scale_c_ptr = nullptr; + ElementScalar const* scale_d_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_1,_0,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { // binary op : activation((scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias)) * scale_d + { // unary op : activation((scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias)) + { // ternary op : (scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias) + {{beta, scale_c}, + {beta_ptr, scale_c_ptr}, + {dBeta, {_0{}, _0{}, 0}} + }, // leaf args : (scale_c * beta) + {}, // leaf args : C + { // ternary op : (scale_a * scale_b * alpha) * acc + bias + {{alpha, scale_a, scale_b}, + {alpha_ptr, scale_a_ptr, scale_b_ptr}, + {dAlpha, {_0{}, _0{}, 0}, {_0{}, _0{}, 0}} + }, // leaf args : (scale_a * scale_b * alpha) + {}, // leaf args : acc + {bias_ptr, 
ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args : activation + }, // end unary op + {{scale_d}, + {scale_d_ptr} + }, // leaf args : scale_d + {} // binary args : multiplies or first + }; // end binary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-col bias +template< + class CtaTileShapeMNK, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90ScaledLinCombPerColBias = + Sm90EVT, // beta * C + (alpha * acc + bias) + Sm90ScalarBroadcast, 2>, // scale_c * beta + Sm90SrcFetch, // C + Sm90EVT, // alpha * acc + bias + Sm90ScalarBroadcast, 3>, // scale_a * scale_b * alpha + Sm90AccFetch, // acc + Sm90RowBroadcast<0, CtaTileShapeMNK, ElementBias, ElementCompute, Stride<_0,_1,int64_t>, AlignmentBias> // bias + > + >; + +// Z = scale_a * scale_b * alpha * acc + beta * scale_c * C + per-col bias +// if D is fp8 +// D = scale_d * activation(Z) +// else +// D = activation(Z) +template< + class CtaTileShapeMNK, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90ScaledLinCombPerColBiasEltAct = + Sm90EVT::template Op, ElementOutput, ElementCompute, RoundStyle>, // activation(Z) * scale_d + Sm90EVT, // activation(Z) + // Z = scale_a * scale_b * alpha * acc + beta * scale_c * 
C + per-row bias + Sm90ScaledLinCombPerColBias + >, + Sm90ScalarBroadcast // scale_d + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::ScaledLinCombPerColBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile +> : Sm90ScaledLinCombPerColBiasEltAct< + CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + > { + + using Impl = + Sm90ScaledLinCombPerColBiasEltAct< + CtaTileShapeMNK, ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + using Operation = + fusion::ScaledLinCombPerColBiasEltAct< + ActivationFn, ElementOutput, ElementCompute, ElementBias, ElementSource, ElementScalar, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + ElementScalar scale_a = ElementScalar(1); + ElementScalar scale_b = ElementScalar(1); + ElementScalar scale_c = ElementScalar(1); + ElementScalar scale_d = ElementScalar(1); + ElementScalar const* scale_a_ptr = nullptr; + ElementScalar const* scale_b_ptr = nullptr; + ElementScalar const* scale_c_ptr = nullptr; + ElementScalar const* scale_d_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = 
Stride<_0,_1,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + operator typename Impl::Arguments() const { + return + { // binary op : activation((scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias)) * scale_d + { // unary op : activation((scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias)) + { // ternary op : (scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias) + {{beta, scale_c}, + {beta_ptr, scale_c_ptr}, + {dBeta, {_0{}, _0{}, 0}} + }, // leaf args : (scale_c * beta) + {}, // leaf args : C + { // ternary op : (scale_a * scale_b * alpha) * acc + bias + {{alpha, scale_a, scale_b}, + {alpha_ptr, scale_a_ptr, scale_b_ptr}, + {dAlpha, {_0{}, _0{}, 0}, {_0{}, _0{}, 0}} + }, // leaf args : (scale_a * scale_b * alpha) + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + activation // unary args : activation + }, // end unary op + {{scale_d}, + {scale_d_ptr} + }, // leaf args : scale_d + {} // binary args : multiplies or first + }; // end binary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Z = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-row bias +// if D is fp8 +// amax_d = max(abs(elements in activation(Z))) +// D = scale_d * activation(Z) +// else +// D = activation(Z) +// if Aux is fp8 +// amax_aux = max(abs(elements in Z)) +// Aux = scale_aux * Z +// else +// Aux = Z + +// fp8 aux specialization +template< + class CtaTileShapeMNK, + class EpilogueTile, + int StagesD, + class StrideAux, + class SmemLayoutAtom, + class CopyOpR2S, + template class ActivationFn, + class ElementOutput, + class 
ElementCompute, + class ElementAux = ElementOutput, + class ElementAmax = ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentAux = 128 / sizeof_bits_v, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90ScaledLinCombPerRowBiasEltActAmaxAuxFp8 = + Sm90SplitTreeVisitor< + // Z = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-row bias + Sm90ScaledLinCombPerRowBias, + // D = activation(Z) * scale_d, amax_d = max(abs(elements in D)) + Sm90EVT::template Op, ElementOutput, ElementCompute, RoundStyle>, // activation(Z) * scale_d + Sm90EVT, // amax_d + Sm90EVT, // activation(Z) + Sm90SplitTreeFetch // Z + > + >, + Sm90ScalarBroadcast // scale_d + >, + // Aux = Z * scale_aux, amax_aux = max(abs(elements in Aux)) + Sm90EVT, // store(Aux) + Sm90EVT, // Z * scale_aux + Sm90EVT, // amax_aux + Sm90SplitTreeFetch // Z + >, + Sm90ScalarBroadcast // scale_aux + > + > + >; + +// non-fp8 aux specialization +// lets us use some EVT specializations such as relu + uint1b_t aux +template< + class CtaTileShapeMNK, + class EpilogueTile, + int StagesD, + class StrideAux, + class SmemLayoutAtom, + class CopyOpR2S, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux = ElementOutput, + class ElementAmax = ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentAux = 128 / sizeof_bits_v, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90ScaledLinCombPerRowBiasEltActAmaxAuxNotFp8 = + // D = activation(Z) * scale_d, amax_d = max(abs(elements in D)) + Sm90EVT::template Op, ElementOutput, ElementCompute, RoundStyle>, // activation(Z) * scale_d + Sm90EVT, // amax_d + Sm90EVT, // activation(Z) + Sm90EVT, // Aux = Z + // Z 
= scale_a * scale_b * alpha * acc + scale_c * beta * C + per-row bias + Sm90ScaledLinCombPerRowBias + > + > + >, + Sm90ScalarBroadcast // scale_d + >; + +// dispatcher +template< + class CtaTileShapeMNK, + class EpilogueTile, + int StagesD, + class StrideAux, + class SmemLayoutAtom, + class CopyOpR2S, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux = ElementOutput, + class ElementAmax = ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentAux = 128 / sizeof_bits_v, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90ScaledLinCombPerRowBiasEltActAmaxAux = conditional_t, + Sm90ScaledLinCombPerRowBiasEltActAmaxAuxFp8< + CtaTileShapeMNK, EpilogueTile, StagesD, StrideAux, SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar,AlignmentAux, AlignmentBias, RoundStyle + >, + Sm90ScaledLinCombPerRowBiasEltActAmaxAuxNotFp8< + CtaTileShapeMNK, EpilogueTile, StagesD, StrideAux, SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + > +>; + + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class GmemLayoutTagAux, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux, + class ElementAmax, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentAux, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile, + class SmemLayoutAtom, + class CopyOpR2S +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::ScaledLinCombPerRowBiasEltActAmaxAux< + GmemLayoutTagAux, ActivationFn, 
ElementOutput, ElementCompute, + ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile, + SmemLayoutAtom, + CopyOpR2S +> : Sm90ScaledLinCombPerRowBiasEltActAmaxAux< + CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t, + SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + > { + + using Impl = + Sm90ScaledLinCombPerRowBiasEltActAmaxAux< + CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t, + SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >; + using Operation = + fusion::ScaledLinCombPerRowBiasEltActAmaxAux< + GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute, + ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + ElementScalar scale_a = ElementScalar(1); + ElementScalar scale_b = ElementScalar(1); + ElementScalar scale_c = ElementScalar(1); + ElementScalar scale_d = ElementScalar(1); + ElementScalar const* scale_a_ptr = nullptr; + ElementScalar const* scale_b_ptr = nullptr; + ElementScalar const* scale_c_ptr = nullptr; + ElementScalar const* scale_d_ptr = nullptr; + + ElementScalar scale_aux = ElementScalar(1); + ElementScalar const* scale_aux_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using StrideBias = Stride<_1,_0,int64_t>; + ElementBias const* bias_ptr = nullptr; 
+ StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + ElementAmax* amax_D_ptr = nullptr; + ElementAmax* amax_aux_ptr = nullptr; + + using StrideAux = cutlass::gemm::TagToStrideC_t; + ElementAux* aux_ptr = nullptr; + StrideAux dAux = {}; + + operator typename Impl::Arguments() const { + // Only compute amax_d if D is fp8 + ElementAmax* amax_D_ptr_ = nullptr; + if constexpr (detail::is_fp8_v) { + amax_D_ptr_ = amax_D_ptr; + } + + // Aux is fp8 -> DAG arguments + if constexpr (detail::is_fp8_v) { + typename Impl::Arguments args; + // always use structured binding to unpack DAG args since it may or may not be a tuple + auto& [Z_args, aux_args, D_args] = args; + + Z_args = + { // ternary op : (scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias) + {{beta, scale_c}, + {beta_ptr, scale_c_ptr}, + {dBeta, {_0{}, _0{}, 0}} + }, // leaf args : (scale_c * beta) + {}, // leaf args : C + { // ternary op : (scale_a * scale_b * alpha) * acc + bias + {{alpha, scale_a, scale_b}, + {alpha_ptr, scale_a_ptr, scale_b_ptr}, + {dAlpha ,{_0{}, _0{}, 0}, {_0{}, _0{}, 0}} + }, // leaf args : (scale_a * scale_b * alpha) + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }; // end ternary op + + D_args = + { // binary op : activation(Z) * scale_d or activation(Z) + { // unary op : reduce(activation(Z)) + { // unary op : activation(Z) + {}, // leaf args : Z + activation // unary args : activation + }, // end unary op + {amax_D_ptr_} // unary args : reduce + }, // end unary op + {{scale_d}, + {scale_d_ptr} + }, // leaf args : scale_d + {} // binary args : multiplies or first + }; // end binary op + + aux_args = + { // unary op : store(Aux) + { // binary op : Z * scale_d or Z + { // unary op : reduce(Z) + {}, // leaf args : Z + {amax_aux_ptr} // unary args : 
reduce + }, // end unary op + {{scale_aux}, + {scale_aux_ptr} + }, // leaf args : scale_d + {} // binary args : multiplies + }, // end binary op + {aux_ptr, dAux} // unary args : store + }; // end unary op + + return args; + } + + // Aux is not fp8 -> Tree arguments + else { + return + { // binary op : activation(Z) * scale_d or activation(Z) + { // unary op : reduce(activation(Z)) + { // unary op : activation(Z) + { // unary op : store(Z) + { // ternary op : (scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias) + {{beta, scale_c}, + {beta_ptr, scale_c_ptr}, + {dBeta, {_0{}, _0{}, 0}} + }, // leaf args : (scale_c * beta) + {}, // leaf args : C + { // ternary op : (scale_a * scale_b * alpha) * acc + bias + {{alpha, scale_a, scale_b}, + {alpha_ptr, scale_a_ptr, scale_b_ptr}, + {dAlpha, {_0{}, _0{}, 0}} + }, // leaf args : (scale_a * scale_b * alpha) + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias + }, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + {aux_ptr, dAux} // unary args : store + }, // end unary op + activation // unary args : activation + }, // end unary op + {amax_D_ptr_} // unary args : reduce + }, // end unary op + {{scale_d},{scale_d_ptr}}, // leaf args : scale_d + {} // binary args : multiplies or first + }; // end binary op + } + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Z = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-col bias +// if D is fp8 +// amax_d = max(abs(elements in activation(Z))) +// D = scale_d * activation(Z) +// else +// D = activation(Z) +// if Aux is fp8 +// amax_aux = max(abs(elements in Z)) +// Aux = scale_aux * Z +// else +// Aux = Z + +// fp8 aux specialization +template< + class CtaTileShapeMNK, + class EpilogueTile, + int StagesD, + class StrideAux, + class SmemLayoutAtom, + class 
CopyOpR2S, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux = ElementOutput, + class ElementAmax = ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentAux = 128 / sizeof_bits_v, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90ScaledLinCombPerColBiasEltActAmaxAuxFp8 = + Sm90SplitTreeVisitor< + // Z = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-col bias + Sm90ScaledLinCombPerColBias, + // D = activation(Z) * scale_d, amax_d = max(abs(elements in D)) + Sm90EVT::template Op, ElementOutput, ElementCompute, RoundStyle>, // activation(Z) * scale_d + Sm90EVT, // amax_d + Sm90EVT, // activation(Z) + Sm90SplitTreeFetch // Z + > + >, + Sm90ScalarBroadcast // scale_d + >, + // Aux = Z * scale_aux, amax_aux = max(abs(elements in Aux)) + Sm90EVT, // store(Aux) + Sm90EVT, // Z * scale_aux + Sm90EVT, // amax_aux + Sm90SplitTreeFetch // Z + >, + Sm90ScalarBroadcast // scale_aux + > + > + >; + +// non-fp8 aux specialization +// lets us use some EVT specializations such as relu + uint1b_t aux +template< + class CtaTileShapeMNK, + class EpilogueTile, + int StagesD, + class StrideAux, + class SmemLayoutAtom, + class CopyOpR2S, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux = ElementOutput, + class ElementAmax = ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentAux = 128 / sizeof_bits_v, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90ScaledLinCombPerColBiasEltActAmaxAuxNotFp8 = + // D = activation(Z) * scale_d, amax_d = max(abs(elements in D)) + Sm90EVT::template Op, ElementOutput, ElementCompute, RoundStyle>, // activation(Z) * scale_d + 
Sm90EVT, // amax_d + Sm90EVT, // activation(Z) + Sm90EVT, // Aux = Z + // Z = scale_a * scale_b * alpha * acc + scale_c * beta * C + per-row bias + Sm90ScaledLinCombPerColBias + > + > + >, + Sm90ScalarBroadcast // scale_d + >; + +// dispatcher +template< + class CtaTileShapeMNK, + class EpilogueTile, + int StagesD, + class StrideAux, + class SmemLayoutAtom, + class CopyOpR2S, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux = ElementOutput, + class ElementAmax = ElementCompute, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentAux = 128 / sizeof_bits_v, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90ScaledLinCombPerColBiasEltActAmaxAux = conditional_t, + Sm90ScaledLinCombPerColBiasEltActAmaxAuxFp8< + CtaTileShapeMNK, EpilogueTile, StagesD, StrideAux, SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar,AlignmentAux, AlignmentBias, RoundStyle + >, + Sm90ScaledLinCombPerColBiasEltActAmaxAuxNotFp8< + CtaTileShapeMNK, EpilogueTile, StagesD, StrideAux, SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + > +>; + + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class GmemLayoutTagAux, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux, + class ElementAmax, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentAux, + int AlignmentBias, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile, + class SmemLayoutAtom, + class CopyOpR2S +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + 
fusion::ScaledLinCombPerColBiasEltActAmaxAux< + GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute, + ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile, + SmemLayoutAtom, + CopyOpR2S +> : Sm90ScaledLinCombPerColBiasEltActAmaxAux< + CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t, + SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + > { + + using Impl = + Sm90ScaledLinCombPerColBiasEltActAmaxAux< + CtaTileShapeMNK, EpilogueTile, StagesD, cutlass::gemm::TagToStrideC_t, + SmemLayoutAtom, CopyOpR2S, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >; + using Operation = + fusion::ScaledLinCombPerColBiasEltActAmaxAux< + GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute, + ElementAux, ElementAmax, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + ElementScalar scale_a = ElementScalar(1); + ElementScalar scale_b = ElementScalar(1); + ElementScalar scale_c = ElementScalar(1); + ElementScalar scale_d = ElementScalar(1); + ElementScalar const* scale_a_ptr = nullptr; + ElementScalar const* scale_b_ptr = nullptr; + ElementScalar const* scale_c_ptr = nullptr; + ElementScalar const* scale_d_ptr = nullptr; + + ElementScalar scale_aux = ElementScalar(1); + ElementScalar const* scale_aux_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + 
using StrideBias = Stride<_0,_1,int64_t>; + ElementBias const* bias_ptr = nullptr; + StrideBias dBias = {}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + ElementAmax* amax_D_ptr = nullptr; + ElementAmax* amax_aux_ptr = nullptr; + + using StrideAux = cutlass::gemm::TagToStrideC_t; + ElementAux* aux_ptr = nullptr; + StrideAux dAux = {}; + + operator typename Impl::Arguments() const { + // Only compute amax_d if D is fp8 + ElementAmax* amax_D_ptr_ = nullptr; + if constexpr (detail::is_fp8_v) { + amax_D_ptr_ = amax_D_ptr; + } + + // Aux is fp8 -> DAG arguments + if constexpr (detail::is_fp8_v) { + typename Impl::Arguments args; + // always use structured binding to unpack DAG args since it may or may not be a tuple + auto& [Z_args, aux_args, D_args] = args; + + Z_args = + { // ternary op : (scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias) + {{beta, scale_c}, + {beta_ptr, scale_c_ptr}, + {dBeta, {_0{}, _0{}, 0}} + }, // leaf args : (scale_c * beta) + {}, // leaf args : C + { // ternary op : (scale_a * scale_b * alpha) * acc + bias + {{alpha, scale_a, scale_b}, + {alpha_ptr, scale_a_ptr, scale_b_ptr}, + {dAlpha, {_0{}, _0{}, 0}, {_0{}, _0{}, 0}} + }, // leaf args : (scale_a * scale_b * alpha) + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias}, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }; // end ternary op + + D_args = + { // binary op : activation(Z) * scale_d or activation(Z) + { // unary op : reduce(activation(Z)) + { // unary op : activation(Z) + {}, // leaf args : Z + activation // unary args : activation + }, // end unary op + {amax_D_ptr_} // unary args : reduce + }, // end unary op + {{scale_d}, + {scale_d_ptr} + }, // leaf args : scale_d + {} // binary args : multiplies or first + }; // end binary op + + aux_args = + { // unary op : store(Aux) + { // binary op : Z * scale_d or Z + { 
// unary op : reduce(Z) + {}, // leaf args : Z + {amax_aux_ptr} // unary args : reduce + }, // end unary op + {{scale_aux}, + {scale_aux_ptr} + }, // leaf args : scale_d + {} // binary args : multiplies + }, // end binary op + {aux_ptr, dAux} // unary args : store + }; // end unary op + + return args; + } + + // Aux is not fp8 -> Tree arguments + else { + return + { // binary op : activation(Z) * scale_d or activation(Z) + { // unary op : reduce(activation(Z)) + { // unary op : activation(Z) + { // unary op : store(Z) + { // ternary op : (scale_c * beta) * C + ((scale_a * scale_b * alpha) * acc + bias) + {{beta, scale_c}, + {beta_ptr, scale_c_ptr}, + {dBeta, {_0{}, _0{}, 0}} + }, // leaf args : (scale_c * beta) + {}, // leaf args : C + { // ternary op : (scale_a * scale_b * alpha) * acc + bias + {{alpha, scale_a, scale_b}, + {alpha_ptr, scale_a_ptr, scale_b_ptr}, + {dAlpha, {_0{}, _0{}, 0}, {_0{}, _0{}, 0}} + }, // leaf args : (scale_a * scale_b * alpha) + {}, // leaf args : acc + {bias_ptr, ElementBias(0), dBias + }, // leaf args : bias + {} // ternary args : multiply_add + }, // end ternary op + {} // ternary args : multiply_add + }, // end ternary op + {aux_ptr, dAux} // unary args : store + }, // end unary op + activation // unary args : activation + }, // end unary op + {amax_D_ptr_} // unary args : reduce + }, // end unary op + {{scale_d},{scale_d_ptr}}, // leaf args : scale_d + {} // binary args : multiplies or first + }; // end binary op + } + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template< + class CtaTileShapeMNK, + class EpilogueTile, + int Stages, + class StrideAux, + class SmemLayoutAtom, + class CopyOpS2R, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentAux = 128 / 
sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinCombDeEltAct = + Sm90EVT, // activation(beta * C + (alpha * acc), aux) + Sm90LinearCombination, // beta * C + (alpha * acc) + Sm90AuxLoad // aux + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class GmemLayoutTagAux, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux, + class ElementSource, + class ElementScalar, + int AlignmentAux, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile, + class SmemLayoutAtom, + class CopyOpS2R +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::LinCombDeEltAct< + GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute, + ElementAux, ElementSource, ElementScalar, AlignmentAux, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile, + SmemLayoutAtom, + CopyOpS2R +> : Sm90LinCombDeEltAct< + CtaTileShapeMNK, EpilogueTile, StagesC, cutlass::gemm::TagToStrideC_t, SmemLayoutAtom, CopyOpS2R, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementSource, ElementScalar, AlignmentAux, RoundStyle + > { + + using Impl = + Sm90LinCombDeEltAct< + CtaTileShapeMNK, EpilogueTile, StagesC, cutlass::gemm::TagToStrideC_t, SmemLayoutAtom, CopyOpS2R, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementSource, ElementScalar, AlignmentAux, RoundStyle + >; + using Operation = + fusion::LinCombDeEltAct< + GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute, + ElementAux, ElementSource, ElementScalar, AlignmentAux, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, 
_0{}, 0}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + using StrideAux = cutlass::gemm::TagToStrideC_t; + ElementAux const* aux_ptr = nullptr; + StrideAux dAux = {}; + + operator typename Impl::Arguments() const { + return + { // binary op : activation(beta * C + (alpha * acc), aux) + { // ternary op : beta * C + (alpha * acc) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // binary op : alpha * acc + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // binary args : multiplies + }, // end binary op + {} // ternary args : multiply_add + }, // end ternary op + {aux_ptr, ElementAux(0), dAux}, // leaf args : aux + activation // binary args : activation + }; // end binary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template< + class CtaTileShapeMNK, + class EpilogueTile, + int Stages, + class StrideAux, + class SmemLayoutAtom, + class CopyOpS2R, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux = ElementOutput, + class ElementBias = ElementOutput, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + int AlignmentAux = 128 / sizeof_bits_v, + int AlignmentBias = 128 / sizeof_bits_v, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinCombDeEltActDePerRowBias = + Sm90EVT, // Identity for final conversion + Sm90EVT, AlignmentBias>, + Sm90LinCombDeEltAct + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class GmemLayoutTagAux, + template class ActivationFn, + class ElementOutput, + class ElementCompute, + class ElementAux, + class ElementBias, + class ElementSource, + class ElementScalar, + int AlignmentAux, + int AlignmentBias, + 
FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile, + class SmemLayoutAtom, + class CopyOpS2R +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::LinCombDeEltActDePerRowBias< + GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute, + ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >, + CtaTileShapeMNK, + EpilogueTile, + SmemLayoutAtom, + CopyOpS2R +> : Sm90LinCombDeEltActDePerRowBias< + CtaTileShapeMNK, EpilogueTile, StagesC, cutlass::gemm::TagToStrideC_t, SmemLayoutAtom, CopyOpS2R, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + > { + + using Impl = + Sm90LinCombDeEltActDePerRowBias< + CtaTileShapeMNK, EpilogueTile, StagesC, cutlass::gemm::TagToStrideC_t, SmemLayoutAtom, CopyOpS2R, ActivationFn, + ElementOutput, ElementCompute, ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >; + using Operation = + fusion::LinCombDeEltActDePerRowBias< + GmemLayoutTagAux, ActivationFn, ElementOutput, ElementCompute, + ElementAux, ElementBias, ElementSource, ElementScalar, AlignmentAux, AlignmentBias, RoundStyle + >; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + using ActivationArguments = typename Sm90Compute::Arguments; + ActivationArguments activation = ActivationArguments(); + + using StrideAux = cutlass::gemm::TagToStrideC_t; + ElementAux const* aux_ptr = nullptr; + StrideAux dAux = {}; + + using StrideBias = Stride<_1,_0,int64_t>; + ElementBias* dbias_ptr = nullptr; + StrideBias dDbias = {}; + + operator typename 
Impl::Arguments() const { + return + { // unary op : identity/convert + { // unary op : reduce(activation(beta * C + (alpha * acc), aux)) + { // binary op : activation(beta * C + (alpha * acc), aux) + { // ternary op : beta * C + (alpha * acc) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // binary op : alpha * acc + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // binary args : multiplies + }, // end binary op + {} // ternary args : multiply_add + }, // end ternary op + {aux_ptr, ElementAux(0), dAux}, // leaf args : aux + activation // binary args : activation + }, // end binary op + {dbias_ptr, ElementCompute(0), dDbias} // unary args : reduce + }, // end unary op + {} // unary args : identity/convert + }; // end unary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// D = softmax(top_k(alpha * acc + beta * C)) +template< + int TopK, + int FragmentSize, + class CtaTileShapeMNK, + class EpilogueTile, + class ElementOutput, + class ElementCompute, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinCombTopKSoftmaxCol = + Sm90EVT, // softmax(top_k(beta * C + (alpha * acc))) + Sm90LinearCombination // beta * C + (alpha * acc) + >; + +template < + int TopK, + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementSource, + class ElementScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::LinCombTopKSoftmaxCol, + CtaTileShapeMNK, + EpilogueTile +> : Sm90LinCombTopKSoftmaxCol { + + using Impl = Sm90LinCombTopKSoftmaxCol::type, ElementCompute, ElementSource, ElementScalar, 
RoundStyle>; + using Operation = fusion::LinCombTopKSoftmaxCol; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + operator typename Impl::Arguments() const { + return + { // unary op: activation(beta * C + (alpha * acc)) + { // ternary op : beta * C + (alpha * acc) + {{beta}, {beta_ptr}}, // leaf args : beta + {}, // leaf args : C + { // binary op : alpha * acc + {{alpha}, {alpha_ptr}}, // leaf args : alpha + {}, // leaf args : acc + {} // binary args : multiplies + }, // end binary op + {} // ternary args : multiply_add + }, // end ternary op + {} // unary args: activation + }; // end unary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Grouped Wgrad Conv +template< + class GroupsPerTile, + class ElementOutput, + class ElementCompute, + class ElementSource = ElementOutput, + class ElementScalar = ElementCompute, + FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest +> +using Sm90LinearCombinationGroupedWgrad = + Sm90EVT, // beta * C + (alpha * acc) + Sm90ScalarBroadcast>, // beta + Sm90SrcFetch, // C + Sm90EVT, // alpha * acc + Sm90ScalarBroadcast>, // alpha + Sm90AccFetchGroupedWgrad // acc + > + >; + +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + class ElementOutput, + class ElementCompute, + class ElementSource, + class ElementScalar, + FloatRoundStyle RoundStyle, + class CtaTileShapeMNK, + class EpilogueTile, + class GroupsPerTile +> +struct FusionCallbacks< + epilogue::Sm90TmaWarpSpecialized, + fusion::LinearCombinationGroupedWgrad, + CtaTileShapeMNK, + EpilogueTile +> : Sm90LinearCombinationGroupedWgrad::type, ElementCompute, ElementSource, ElementScalar, RoundStyle> { + + using Impl = Sm90LinearCombinationGroupedWgrad::type, 
ElementCompute, ElementSource, ElementScalar, RoundStyle>; + using Operation = fusion::LinearCombinationGroupedWgrad; + + struct Arguments { + ElementScalar alpha = ElementScalar(1); + ElementScalar beta = ElementScalar(0); + //ElementScalar groups = ElementScalar(1); + ElementScalar const* alpha_ptr = nullptr; + ElementScalar const* beta_ptr = nullptr; + + using StrideAlpha = Stride<_0,_0,int64_t>; + using StrideBeta = Stride<_0,_0,int64_t>; + StrideAlpha dAlpha = {_0{}, _0{}, 0}; + StrideBeta dBeta = {_0{}, _0{}, 0}; + + operator typename Impl::Arguments() const { + return + { // ternary op : beta * C + (alpha * acc) + {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta + {}, // leaf args : C + { // binary op : alpha * acc + {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha + {}, // leaf args : acc + {} // binary args : multiplies + }, // end binary op + {} // ternary args : multiply_add + }; // end ternary op + } + }; + + // Ctor inheritance + using Impl::Impl; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { +template > +struct get_element_aux { + using type = void; +}; + +template +struct get_element_aux> { + using type = typename FusionOpOrCallbacks::ElementAux; +}; + +template +struct get_element_aux, cute::void_t<>> { + using type = typename get_element_aux::type; +}; + +template +struct get_element_aux, cute::void_t::Operation>> { + private: + using Operation = typename FusionCallbacks::Operation; + public: + using type = typename get_element_aux::type; +}; +} // namespace cutlass:epilogue::fusion::detail + +template +using get_element_aux_t = typename detail::get_element_aux::type; + +} // namespace cutlass::epilogue::fusion + +///////////////////////////////////////////////////////////////////////////////////////////////// + + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git 
a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ae63a7675c12dc4329374815da4d081a6bd885ee --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp @@ -0,0 +1,842 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Visitor tree compute operations for the sm90 TMA warp-specialized (ws) epilogue +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/epilogue/thread/activation.h" +#include "cutlass/detail/helper_macros.hpp" + +#include "cute/tensor.hpp" + +#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp" +#include "cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::fusion { + +using namespace cute; +using namespace detail; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// N-nary Elementwise Compute Operation +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +// The template argument provided for ComputeFn must be able to accept +// exactly one template parameter. In Standard C++, it's OK for +// ComputeFn to have other template parameters, as long as those have +// defaults. 
For example, the following struct Foo would work. +// +// template +// struct Foo { +// CUTLASS_HOST_DEVICE auto operator() (A a, B b); +// }; +// +// However, some compilers, such as Clang, require that the argument +// take _exactly_ one template parameter. This is nonstandard C++ +// behavior. One work-around for this case is to create a subclass +// with exactly one template parameter, and then use that subclass as +// the template argument. +// +// template +// struct FooHomogeneous : public Foo {}; +// +template< + template class ComputeFn, + class ElementOutput, + class ElementCompute, + FloatRoundStyle RoundStyle, + class = void +> +struct Sm90Compute { +private: + using EmptyArguments = typename Sm90VisitorImpl<>::Arguments; + + template + struct ComputeArguments { + using type = EmptyArguments; + }; + + // partial specialization for compute fns that define an Arguments member, e.g. activation hyperparameters + template + struct ComputeArguments> { + using type = typename Fn::Arguments; + }; + +public: + struct SharedStorage { }; + + using Arguments = typename ComputeArguments>::type; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const&, Arguments const& args, void*) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const&, Arguments const&) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_HOST_DEVICE + Sm90Compute() + : params() {} + + CUTLASS_HOST_DEVICE + 
Sm90Compute(Params const& params, SharedStorage const& shared_storage) + : params(params) {} + + Params const params; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(Params const& params) + : params(params) {} + + Params const& params; + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n, + Array const&... frg_inputs) { + return transform_apply(cute::make_tuple(frg_inputs...), + [&] (auto&& frg_input) CUTLASS_LAMBDA_FUNC_INLINE { + using ElementInput = typename cute::remove_cvref_t::Element; + using ConvertInput = NumericArrayConverter; + ConvertInput convert_input{}; + + return convert_input(frg_input); + }, + [&] (auto&&... cvt_frg_inputs) CUTLASS_LAMBDA_FUNC_INLINE { + using ComputeOutput = ComputeFn>; + ComputeOutput compute_output{}; + + if constexpr (cute::is_same_v) { + using ElementComputeOutput = + typename cute::remove_cvref_t::Element; + using ConvertOutput = NumericArrayConverter; + ConvertOutput convert_output{}; + return convert_output(compute_output(cvt_frg_inputs...)); + } + else { + using ElementComputeOutput = + typename cute::remove_cvref_t::Element; + using ConvertOutput = NumericArrayConverter; + ConvertOutput convert_output{}; + return convert_output(compute_output(cvt_frg_inputs..., params)); + } + } + ); + } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + return ConsumerStoreCallbacks(params); + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Performance Optimized Specializations +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +// beta * C + Z +template < + class ElementOutput, + class ElementCompute, + FloatRoundStyle RoundStyle, + class InputScaleOp, // beta + class ElementSource, // C + class InputAddOp // Z +> +struct Sm90TreeVisitor< + Sm90Compute().is_zero())>>, + InputScaleOp, + Sm90SrcFetch, + InputAddOp +> : Sm90VisitorImpl< + InputScaleOp, + Sm90SrcFetch, + InputAddOp, + Sm90Compute + > +{ + using Impl = + Sm90VisitorImpl< + InputScaleOp, + Sm90SrcFetch, + InputAddOp, + Sm90Compute + >; + using Params = typename Impl::Params; + using SharedStorage = typename Impl::SharedStorage; + + CUTLASS_HOST_DEVICE + Sm90TreeVisitor() {} + + CUTLASS_HOST_DEVICE + Sm90TreeVisitor( + Params const& params, + SharedStorage const& shared_storage) + : Impl(params, shared_storage) {} + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + auto const& scale_op = get<0>(Impl::ops); + auto const& added_op = get<2>(Impl::ops); + if constexpr (detail::IsScalarBroadcast::value && not is_void_v) { + return (get<2>(scale_op.params_ptr->dScalar[0]) != 0 && scale_op.params_ptr->scalar_ptrs[0] != nullptr) || + is_C_load_needed() || + added_op.is_producer_load_needed(); + } + else { + return is_C_load_needed() || added_op.is_producer_load_needed(); + } + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + auto const& scale_op = get<0>(Impl::ops); + auto const& src_op = get<1>(Impl::ops); + auto const& added_op = get<2>(Impl::ops); + return (not scale_op.is_zero() && src_op.is_C_load_needed()) || added_op.is_C_load_needed(); + } + + template + struct ConsumerStoreCallbacks : CallbacksImpl { + CUTLASS_DEVICE + 
ConsumerStoreCallbacks(bool is_C_load_needed, CallbacksImpl&& impl) + : is_C_load_needed(is_C_load_needed), CallbacksImpl(cute::forward(impl)) { } + + bool is_C_load_needed; + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_added = get<2>(CallbacksImpl::callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n); + + using ElementZ = typename decltype(frg_added)::Element; + using ConvertZ = NumericArrayConverter; + using ConvertI = NumericArrayConverter; + ConvertZ convert_Z{}; + ConvertI convert_I{}; + + Array frg_I = convert_Z(frg_added); + + if constexpr (!is_void_v) { + Array frg_scalar = get<0>(CallbacksImpl::callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n); + Array frg_source = get<1>(CallbacksImpl::callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n); + + using ElementX = typename decltype(frg_scalar)::Element; + using ElementY = typename decltype(frg_source)::Element; + using ConvertX = NumericArrayConverter; + using ConvertY = NumericArrayConverter; + using ComputeI = multiply_add>; + ConvertX convert_X{}; + ConvertY convert_Y{}; + ComputeI compute_I{}; + + frg_I = compute_I(convert_X(frg_scalar), convert_Y(frg_source), frg_I); + } + + return convert_I(frg_I); + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + auto callbacks_tuple = Impl::template get_consumer_store_callbacks(args); + bool is_C_load_needed = this->is_C_load_needed(); + if (not is_C_load_needed) { + cute::clear(args.tCrC); + } + return ConsumerStoreCallbacks( + is_C_load_needed, std::move(callbacks_tuple)); + } +}; + +// ReLU with aux bit tensor dReLU/dZ +// Aux(i) = Z(i) >= 0 ? 
1 : 0 +namespace detail { +// Placeholder node so we can retain standard EVT structure +template +struct Sm90ReLUAuxStore : Sm90VisitorImpl<> { + struct SharedStorage {}; + + struct Arguments { + cutlass::uint1b_t* ptr_aux = nullptr; + StrideMNL dAux = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm90ReLUAuxStore() { } + + CUTLASS_HOST_DEVICE + Sm90ReLUAuxStore(Params const& params, SharedStorage const& shared_storage) { } +}; +} // namespace detail + +// Specialization on the generic compute+aux EVT +template < + // Compute node + template class Activation, + class ElementOutput, + class ElementCompute, + FloatRoundStyle RoundStyle, + // Aux node + int Stages, + class EpilogueTile, + class StrideMNL, + class SmemLayoutAtom, + class CopyOpR2S, + int Alignment, + bool EnableNullptr, + // Input node + class InputOp +> +struct Sm90TreeVisitor< + Sm90Compute, cutlass::epilogue::thread::ReLu> || + cute::is_same_v, cutlass::epilogue::thread::Clamp> || + cute::is_same_v, cutlass::epilogue::thread::ThresholdReLU> >>, + Sm90TreeVisitor< + Sm90AuxStore< + Stages, + EpilogueTile, + cutlass::uint1b_t, + RoundStyle, + StrideMNL, + SmemLayoutAtom, + CopyOpR2S, + Alignment, + EnableNullptr + >, + InputOp + > +> : Sm90VisitorImpl< + Sm90VisitorImpl< + InputOp, + detail::Sm90ReLUAuxStore + >, + Sm90Compute + > +{ + using 
Impl = + Sm90VisitorImpl< + Sm90VisitorImpl< + InputOp, + detail::Sm90ReLUAuxStore + >, + Sm90Compute + >; + using Params = typename Impl::Params; + using SharedStorage = typename Impl::SharedStorage; + + CUTLASS_HOST_DEVICE + Sm90TreeVisitor() {} + + CUTLASS_HOST_DEVICE + Sm90TreeVisitor(Params const& params_, SharedStorage const& shared_storage) + : params(params_), Impl(params_, shared_storage) {} + + Params const& params; + + template + struct ConsumerStoreCallbacks : CallbacksImpl { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + RTensor&& tC_rAux, + GTensor&& tC_gAux, + CTensor tC_cAux, + ThrResidue residue_tC_cAux, + Params const& params, + CallbacksImpl&& impl) + : tC_rAux(cute::forward(tC_rAux)), + tC_gAux(cute::forward(tC_gAux)), + tC_cAux(tC_cAux), + residue_tC_cAux(residue_tC_cAux), + params(params), + CallbacksImpl(cute::forward(impl)) {} + + RTensor tC_rAux; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + GTensor tC_gAux; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + CTensor tC_cAux; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + ThrResidue residue_tC_cAux; + Params const& params; + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + // Unpack callbacks + params + auto& [callbacks_input_aux, callbacks_compute] = CallbacksImpl::callbacks_tuple; + auto& [callbacks_input, callbacks_aux] = callbacks_input_aux.callbacks_tuple; + auto const& [params_input_aux, params_compute] = params; + auto const& [params_input, params_aux] = params_input_aux; + + // Visit the input node + Array frg_input = callbacks_input.visit(frg_acc, epi_v, epi_m, epi_n); + + // Compute activation + aux + using ElementInput = typename decltype(frg_input)::Element; + using ConvertInput = NumericArrayConverter; + using ConvertAux = PackPredicates; + using ComputeOutput = Activation; + using ConvertOutput = NumericArrayConverter; + ConvertInput convert_input{}; + ComputeOutput relu{}; + ConvertAux convert_aux{}; + ConvertOutput convert_output{}; + + Array frg_compute = 
convert_input(frg_input); + bool frg_aux[FragmentSize]; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + ElementCompute pre_relu = frg_compute[i]; + if constexpr (cute::is_same_v, cutlass::epilogue::thread::Clamp> || + cute::is_same_v, cutlass::epilogue::thread::ThresholdReLU>) { + frg_compute[i] = relu(frg_compute[i], params_compute); + } + else { + frg_compute[i] = relu(frg_compute[i]); + } + if constexpr (cute::is_same_v) { + uint32_t aux; + asm volatile("set.equ.u32.f32 %0, %1, %2;\n" : "=r"(aux) : "f"(frg_compute[i]), "f"(pre_relu)); // NaN outputs 1 in Aux + frg_aux[i] = static_cast(aux); + } else if constexpr (cute::is_same_v) { + uint32_t aux; + cutlass::half_t compute = frg_compute[i]; + asm volatile("set.equ.u32.f16 %0, %1, %2;\n" : "=r"(aux) : "h"(compute.raw()), "h"(pre_relu.raw())); // NaN outputs 1 in Aux + frg_aux[i] = static_cast(aux); + } else { + frg_aux[i] = frg_compute[i] == pre_relu; + } + } + + static_assert(FragmentSize % 8 == 0, "Predicate vector must be byte-aligned"); + Tensor tC_rAux_frg = recast(coalesce(tC_rAux(_,_,_,epi_m,epi_n))); // (EPI_V) + tC_rAux_frg(epi_v) = convert_aux(frg_aux); + + return convert_output(frg_compute); + } + + CUTLASS_DEVICE void + end() { + // Unpack callbacks + params + auto& [callbacks_input_aux, callbacks_compute] = CallbacksImpl::callbacks_tuple; + auto& [callbacks_input, callbacks_aux] = callbacks_input_aux.callbacks_tuple; + auto const& [params_input_aux, params_compute] = params; + auto const& [params_input, params_aux] = params_input_aux; + + // Visit the input node + callbacks_input.end(); + + // Nullptr is no-op + if constexpr (EnableNullptr) { + if (params_aux.ptr_aux == nullptr) { + return; + } + } + + // Compute vectorization + constexpr auto MCL = decltype(max_common_layout(tC_rAux, tC_gAux)){}; + constexpr int V = cute::min(Alignment, size(MCL)); + // Copy vectorizes into byte-aligned stores + if constexpr (V > 1 && V % 8 == 0) { + using VecType = uint_bit_t; + Tensor 
tC_rAux_vec = recast(tC_rAux); + Tensor tC_gAux_vec = recast(tC_gAux); + Tensor tC_cAux_vec = tensor<1>(zipped_divide(tC_cAux, MCL.compose(Int{}))); + Tensor tC_pAux_vec = cute::lazy::transform(tC_cAux_vec, [&](auto const& c){ return elem_less(c, residue_tC_cAux); }); + copy_if(tC_pAux_vec, tC_rAux_vec, tC_gAux_vec); + } + // sub-byte vectorization, must serialize threads + else { + // Assumes no inter-warp sharing of bytes (most copy layouts should satisfy this) + int lane_idx = canonical_lane_idx(); + Tensor tC_pAux = cute::lazy::transform(tC_cAux, [&](auto const& c){ return elem_less(c, residue_tC_cAux); }); + CUTLASS_PRAGMA_NO_UNROLL + for (int i = 0; i < NumThreadsPerWarp; ++i) { + if (lane_idx == i) { + copy_if(tC_pAux, tC_rAux, tC_gAux); + } + __syncwarp(); + } + } + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + // Unpack params + auto const& [params_input_aux, params_compute] = params; + auto const& [params_input, params_aux] = params_input_aux; + + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + gmem_ptr ptr_aux = make_gmem_ptr(params_aux.ptr_aux); + Tensor mAux = make_tensor(ptr_aux, make_layout(make_shape(M,N,L), params_aux.dAux)); // (M,N,L) + Tensor gAux = local_tile(mAux, take<0,2>(args.tile_shape_mnk), make_coord(m,n,l)); // (CTA_M,CTA_N) + + Tensor tC_gAux = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + gAux, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tC_rAux = make_tensor(shape(tC_gAux)); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + + auto callbacks_impl = Impl::template get_consumer_store_callbacks(args); + return ConsumerStoreCallbacks( + cute::move(tC_rAux), cute::move(tC_gAux), args.tCcD, args.residue_tCcD, params, cute::move(callbacks_impl)); + } +}; + +// Aux load for uint1b_t +template < + int Stages, 
+ class EpilogueTile, + class StrideMNL, + class SmemLayoutAtom, + class CopyOpS2R, + int Alignment, + bool EnableNullptr +> +struct Sm90AuxLoad< + Stages, + EpilogueTile, + cutlass::uint1b_t, + StrideMNL, + SmemLayoutAtom, + CopyOpS2R, + Alignment, + EnableNullptr +> { + static_assert(Alignment % 128 == 0, "sub-16B alignment not supported yet"); + + struct SharedStorage {}; + + struct Arguments { + cutlass::uint1b_t const* ptr_aux = nullptr; + cutlass::uint1b_t null_default = cutlass::uint1b_t(0); + StrideMNL dAux = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm90AuxLoad() { } + + CUTLASS_HOST_DEVICE + Sm90AuxLoad(Params const& params, SharedStorage const&) + : params(params) { } + + Params const params; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(RTensor&& tC_rAux_, GTensor&& tC_gAux_, CTensor tC_cAux_, ThrResidue residue_tC_cAux_, Params const& params_) + : tC_rAux(cute::forward(tC_rAux_)), + tC_gAux(cute::forward(tC_gAux_)), + 
tC_cAux(tC_cAux_), + residue_tC_cAux(residue_tC_cAux_), + params(params_) {} + + RTensor tC_rAux; // (CPY,CPY_M,CPY_N,{EPI_M,EPI_N}) + GTensor tC_gAux; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + CTensor tC_cAux; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + ThrResidue residue_tC_cAux; + Params const& params; + + CUTLASS_DEVICE void + begin() { + if constexpr (decltype(cute::rank(tC_rAux))::value == 5) { + if constexpr (EnableNullptr) { + if (params.ptr_aux == nullptr) { + return; + } + } + + constexpr auto MCL = decltype(max_common_layout(tC_rAux, tC_gAux)){}; + constexpr int V = cute::min(Alignment, size(MCL)); + if constexpr (V > 1) { + using VecType = uint_bit_t; + Tensor tC_gAux_vec = recast(tC_gAux); + Tensor tC_rAux_vec = recast(tC_rAux); + Tensor tC_cAux_vec = tensor<1>(zipped_divide(tC_cAux, MCL.compose(Int{}))); + Tensor tC_pAux_vec = cute::lazy::transform(tC_cAux_vec, [&](auto const& c){ return elem_less(c, residue_tC_cAux); }); + copy_if(tC_pAux_vec, tC_gAux_vec, tC_rAux_vec); + } + else { + Tensor tC_pAux = cute::lazy::transform(tC_cAux, [&](auto const& c){ return elem_less(c, residue_tC_cAux); }); + copy_if(tC_pAux, tC_gAux, tC_rAux); + } + } + } + + CUTLASS_DEVICE void + begin_loop(int epi_m, int epi_n) { + if constexpr (decltype(cute::rank(tC_rAux))::value == 3) { + if constexpr (EnableNullptr) { + if (params.ptr_aux == nullptr) { + return; + } + } + + Tensor tC_pAux = cute::lazy::transform(tC_cAux(_,_,_,epi_m,epi_n), [&](auto const& c){ return elem_less(c, residue_tC_cAux); }); + copy_if(tC_pAux, tC_gAux(_,_,_,epi_m,epi_n), tC_rAux); + } + } + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + using ElementRegister = typename remove_cvref_t::value_type; + if constexpr (decltype(cute::rank(tC_rAux))::value == 3) { + return recast>(coalesce(tC_rAux))(epi_v); + } + else { + return recast>(coalesce(tC_rAux(_,_,_,epi_m,epi_n)))(epi_v); + } + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src 
or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + gmem_ptr ptr_aux = make_gmem_ptr(params.ptr_aux); + Tensor mAux = make_tensor(ptr_aux, make_layout(make_shape(M,N,L), params.dAux)); // (M,N,L) + Tensor gAux = local_tile(mAux, take<0,2>(args.tile_shape_mnk), make_coord(m,n,l)); // (CTA_M,CTA_N) + + Tensor tC_gAux = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + gAux, args.epi_tile, args.tiled_copy, args.thread_idx); + + // If byte-unaligned vectorization, store in registers as uint32_t to reduce redundant pack+unpack instruction sequences + constexpr int V = decltype(max_common_vector(tC_gAux.layout(), make_layout(tC_gAux.shape())))::value; + Tensor tC_rAux = [&] () CUTLASS_LAMBDA_FUNC_INLINE { + if constexpr (V % 8 != 0) { + return make_tensor(take<0,3>(shape(tC_gAux))); // (CPY,CPY_M,CPY_N) + } else { + return make_tensor(shape(tC_gAux)); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + } + }(); + + if constexpr (EnableNullptr) { + if (params.ptr_aux == nullptr) { + fill(tC_rAux, params.null_default); + } + } + + return ConsumerStoreCallbacks( + cute::move(tC_rAux), cute::move(tC_gAux), args.tCcD, args.residue_tCcD, params); + } +}; + +// dReLU specialization +template< + class ElementOutput, + class ElementCompute, + FloatRoundStyle RoundStyle +> +struct Sm90Compute< + cutlass::epilogue::thread::dReLU, + ElementOutput, + ElementCompute, + RoundStyle +> : Sm90VisitorImpl<> { + + using Sm90VisitorImpl<>::Sm90VisitorImpl; + + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n, + Array const& frg_input, + Array const& frg_aux) { + using ConvertInput = NumericArrayConverter; + using ComputeOutput = cutlass::epilogue::thread::dReLU>; + using ConvertOutput = NumericArrayConverter; 
+ ConvertInput convert_input{}; + ComputeOutput compute_output{}; + ConvertOutput convert_output{}; + + return convert_output(compute_output(convert_input(frg_input), frg_aux)); // don't convert frg_aux for dReLU + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + return ConsumerStoreCallbacks(); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::fusion + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp new file mode 100644 index 0000000000000000000000000000000000000000..535d8b082d44ff796fe2efc4e1531b4a3dc2674c --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp @@ -0,0 +1,1492 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief Visitor tree load operations for the sm90 TMA warp-specialized (ws) epilogue +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/arch/barrier.h" +#include "cutlass/epilogue/collective/detail.hpp" +#include "cutlass/detail/helper_macros.hpp" + +#include "cute/tensor.hpp" +#include "sm90_visitor_tma_warpspecialized.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::fusion { + +using namespace cute; +using namespace detail; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Elementwise Fetch Operations +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +// returns accumulator +struct Sm90AccFetch : Sm90VisitorImpl<> { + + using Sm90VisitorImpl<>::Sm90VisitorImpl; + + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + return frg_acc; + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + return ConsumerStoreCallbacks{}; + } +}; + +// Split tree visitor fetches intermediate results from temporary accumulators +using Sm90SplitTreeFetch = Sm90AccFetch; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// returns C +template +struct Sm90SrcFetch : Sm90VisitorImpl<> { + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return is_C_load_needed(); + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return not is_void_v; + } + + CUTLASS_DEVICE bool + is_zero() const { + return is_void_v; + } + + using Sm90VisitorImpl<>::Sm90VisitorImpl; + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(SrcTensor const& tCrC) + : tCrC(tCrC) {} + + SrcTensor const& tCrC; // (CPY,CPY_M,CPY_N) + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + return recast>(tCrC)(epi_v); + } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + // register type may differ from logical type so we can't assert matching types here + return ConsumerStoreCallbacks(args.tCrC); + } +}; + +// returns accumulator in Grouped Conv Wgrad +template +struct Sm90AccFetchGroupedWgrad : Sm90VisitorImpl<> { + + using Sm90VisitorImpl<>::Sm90VisitorImpl; + using GroupsPerTile = GroupsPerTile_; + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(int32_t thread_idx) + : thread_idx(thread_idx) { } + + int32_t thread_idx; + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + + Array frg_acc_rst; + int warp_id = thread_idx / 32; + + // In Grouped Wgrad, only diagonal block data is valid and the others is wrong and useless. + // One block size is C/G x C/G. Note that C/G = Tile_N / GroupsPerTile. + // Copy diagonal block ACC into the first block Col which is the output tensor size Tile_M * C/G. + // Then we can store the valid output tensor tile directly. + if constexpr ( cute::is_same_v ) { + frg_acc_rst = frg_acc; + } + else if constexpr ( cute::is_same_v ) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < 16; i++) { + frg_acc_rst[i] = frg_acc[i + warp_id / 2 * 16]; + } + } + else if constexpr ( cute::is_same_v ) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < 8; i++) { + frg_acc_rst[i] = frg_acc[i + warp_id * 8]; + } + } + else if constexpr ( cute::is_same_v ) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < 4; i++) { + frg_acc_rst[i] = frg_acc[i + warp_id * 8 + i / 2 * 4]; + } + } + + return frg_acc_rst; + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + return ConsumerStoreCallbacks(args.thread_idx); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Elementwise Load Operations +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + int Stages, + class EpilogueTile, + class Element, + class StrideMNL, + class SmemLayoutAtom, + class CopyOpS2R, + int Alignment = 128 / sizeof_bits_v, + bool EnableNullptr = true // Fallback scalar broadcast for nullptr params +> +struct Sm90AuxLoad { + static_assert(Alignment * sizeof_bits_v % 128 == 0, "sub-16B alignment not supported yet"); + + constexpr static bool is_m_major = epilogue::collective::detail::is_m_major(); + // Find the max contiguous layout usable by TMA (if EpilogueTile is a non-compact tiler) + using SmemShapeTma = decltype(make_shape( + max_common_vector(make_layout(get<0>(EpilogueTile{})),make_layout(get<0>(EpilogueTile{}))), + max_common_vector(make_layout(get<1>(EpilogueTile{})),make_layout(get<1>(EpilogueTile{}))))); + using SmemLayoutTma = decltype(tile_to_shape( + SmemLayoutAtom{}, SmemShapeTma{}, + cute::conditional_t, Step<_1,_2>>{} )); + using SmemLayout = decltype(tile_to_shape( + SmemLayoutTma{}, + make_shape(size<0>(shape(EpilogueTile{})), size<1>(shape(EpilogueTile{})), Int{}), + cute::conditional_t, Step<_1,_2,_3>>{} )); + using CopyOpG2S = + SM90_TMA_LOAD + ; + + struct SharedStorage { + alignas(cutlass::detail::alignment_for_swizzle(SmemLayout{})) + array_aligned smem_aux; + }; + + struct Arguments { + Element const* ptr_aux = nullptr; + Element null_default = Element(0); + StrideMNL dAux = {}; + }; + + struct Params { + using TMA_Aux = decltype(make_tma_copy( + CopyOpG2S{}, + make_tensor(make_gmem_ptr(static_cast(nullptr)), repeat_like(StrideMNL{}, int32_t(0)), append<3>(StrideMNL{}, _0{})), + take<0,2>(SmemLayoutTma{}))); 
+ TMA_Aux tma_load_aux; + Element null_default = Element(0); + bool use_default = false; + }; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + // Optionally append 1s until problem shape is rank-4 in case its is only rank-3 (MNK) + auto problem_shape_mnkl = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_mnkl; + auto M_AUX = + size(M) + ; + Tensor tensor_aux = make_tensor(make_gmem_ptr(args.ptr_aux), make_layout(make_shape(M_AUX,N,L), append<3>(args.dAux, _0{}))); + typename Params::TMA_Aux tma_load_aux = make_tma_copy(CopyOpG2S{}, tensor_aux, take<0,2>(SmemLayoutTma{})); + + bool use_default = false; + if constexpr (EnableNullptr) { + use_default = args.ptr_aux == nullptr; + } + + return Params{tma_load_aux, args.null_default, use_default}; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm90AuxLoad() { } + + CUTLASS_HOST_DEVICE + Sm90AuxLoad(Params const& params, SharedStorage const& shared_storage) + : params_ptr(¶ms), + smem_aux(const_cast(shared_storage.smem_aux.data())) { } + + Params const* params_ptr; + Element* smem_aux; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return true; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_zero() const { + return (params_ptr->use_default && params_ptr->null_default == Element(0)); + } + + template + struct ProducerLoadCallbacks : EmptyProducerLoadCallbacks { + CUTLASS_DEVICE 
+ ProducerLoadCallbacks(GTensor&& bGS_gAux, STensor&& bGS_sAux, Params const* params_ptr) + : bGS_gAux(cute::forward(bGS_gAux)), + bGS_sAux(cute::forward(bGS_sAux)), + params_ptr(params_ptr) {} + + GTensor bGS_gAux; // (TMA,TMA_M,TMA_N,EPI_M,EPI_N) + STensor bGS_sAux; // (TMA,TMA_M,TMA_N,PIPE) + Params const* params_ptr; + + CUTLASS_DEVICE void + step(uint64_t* full_mbarrier_ptr, int epi_m, int epi_n, int load_iteration, bool issue_tma_load) { + if constexpr (EnableNullptr) { + if (params_ptr->use_default) { + return; + } + } + + if (issue_tma_load) { + // Increment the expected transaction bytes of the current stage's mbarrier by the subtile's byte-size + constexpr uint32_t copy_bytes = size(take<0,2>(SmemLayout{})) * sizeof_bits_v / 8; + cutlass::arch::ClusterTransactionBarrier::expect_transaction(full_mbarrier_ptr, copy_bytes); + // Issue the TMA load + constexpr uint16_t mcast_mask = 0; + int load_pipe_index = load_iteration % Stages; + copy(params_ptr->tma_load_aux.with(*full_mbarrier_ptr, mcast_mask), + bGS_gAux(_,_,_,epi_m,epi_n), bGS_sAux(_,_,_,load_pipe_index)); + } + } + }; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + auto coord_shape = + make_coord(m, n, l) + ; + Tensor mAux_mn = params_ptr->tma_load_aux.get_tma_tensor(make_shape(M,N,L)); // (M,N,L) + Tensor mAux = coalesce(mAux_mn, take<0,2>(args.tile_shape_mnk)); + Tensor gAux = local_tile(mAux, take<0,2>(args.tile_shape_mnk), coord_shape); // (CTA_M,CTA_N) + + Tensor gAux_epi = flat_divide(gAux, args.epi_tile); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + Tensor sAux_epi = make_tensor(make_smem_ptr(smem_aux), SmemLayout{}); // (EPI_TILE_M,EPI_TILE_N,PIPE) + + ThrCopy thrblk_g2s = params_ptr->tma_load_aux.get_slice(_0{}); + Tensor bGS_gAux = thrblk_g2s.partition_S(gAux_epi); // (TMA,TMA_M,TMA_N,EPI_M,EPI_N) + Tensor bGS_sAux = thrblk_g2s.partition_D(sAux_epi); 
// (TMA,TMA_M,TMA_N,PIPE) + + return ProducerLoadCallbacks( + cute::move(bGS_gAux), cute::move(bGS_sAux), params_ptr); + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(RTensor&& tC_rAux, TiledS2R tiled_s2r, STensorS2R&& tSR_sAux, Params const* params_ptr) + : tC_rAux(cute::forward(tC_rAux)), + tiled_s2r(tiled_s2r), + tSR_sAux(cute::forward(tSR_sAux)), + params_ptr(params_ptr) { } + + TiledS2R tiled_s2r; + RTensor tC_rAux; // (CPY,CPY_M,CPY_N) + STensorS2R tSR_sAux; // (S2R,S2R_M,S2R_N,PIPE) + Params const* params_ptr; + + CUTLASS_DEVICE void + previsit(int epi_m, int epi_n, int load_iteration, bool is_producer_load_needed) { + if constexpr (EnableNullptr) { + if (params_ptr->use_default) { + fill(tC_rAux, params_ptr->null_default); + return; + } + } + + using RLayoutS2R = decltype(cute::layout(TiledS2R{}.get_slice(0).retile_S(RTensor{}))); + Tensor tSR_rAux = make_tensor(tC_rAux.data(), RLayoutS2R{}); // (S2R,S2R_M,S2R_N) + + int load_pipe_index = load_iteration % Stages; + copy(tiled_s2r, tSR_sAux(_,_,_,load_pipe_index), tSR_rAux); + } + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Tensor tC_rAux_frg = recast>(coalesce(tC_rAux)); // (EPI_V) + + return tC_rAux_frg(epi_v); + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + + Tensor mAux_mn = params_ptr->tma_load_aux.get_tma_tensor(make_shape(M,N,L)); // (M,N,L) + Tensor mAux = coalesce(mAux_mn, take<0,2>(args.tile_shape_mnk)); + Tensor tC_gAux = sm90_partition_for_epilogue(mAux, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tC_rAux = make_tensor(take<0,3>(shape(tC_gAux))); // (CPY,CPY_M,CPY_N) + + auto tiled_s2r = conditional_return( + make_tiled_copy_S(Copy_Atom{}, args.tiled_copy), + make_tiled_copy_D(Copy_Atom{}, args.tiled_copy) + ); + Tensor sAux_epi = cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(smem_aux), SmemLayout{})); // (EPI_TILE_M,EPI_TILE_N,PIPE) + auto tSR_sAux = tiled_s2r.get_slice(args.thread_idx).partition_S(sAux_epi); // (S2R,S2R_M,S2R_N,PIPE) + + return ConsumerStoreCallbacks( + cute::move(tC_rAux), tiled_s2r, cute::move(tSR_sAux), params_ptr); + } +}; + +template < + class Element, + class EpilogueTile, // Unused + class LayoutOrStrideMNL, + class SmemLayoutAtom, // Unused + class CopyOpS2R, // Unused + int Alignment, + bool EnableNullptr +> +struct Sm90AuxLoad< + 0, EpilogueTile, Element, LayoutOrStrideMNL, + SmemLayoutAtom, CopyOpS2R, Alignment, EnableNullptr +> { + using ElementAux = Element; + using StrideMNL = cutlass::gemm::TagToStrideC_t; + + struct SharedStorage { }; + + struct Arguments { + Element const* ptr_aux = nullptr; + Element null_default = Element(0); + StrideMNL dAux = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, 
Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm90AuxLoad() { } + + CUTLASS_HOST_DEVICE + Sm90AuxLoad(Params const& params, SharedStorage const& shared_storage) + : params_ptr(¶ms) { } + + Params const* params_ptr; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template< + class GTensorG2R, + class RTensor, + class CTensorG2R, + class ProblemShapeMNL + > + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(GTensorG2R&& tC_gAux, + RTensor&& tC_rAux, + CTensorG2R&& tC_cAux, + ProblemShapeMNL problem_shape_mnl, + Params const* params_ptr) + : tC_gAux(cute::forward(tC_gAux)), + tC_rAux(cute::forward(tC_rAux)), + tC_cAux(cute::forward(tC_cAux)), + problem_shape_mnl(problem_shape_mnl), + params_ptr(params_ptr) {} + + GTensorG2R tC_gAux; + RTensor tC_rAux; + CTensorG2R tC_cAux; + ProblemShapeMNL problem_shape_mnl; + Params const* params_ptr; + + CUTLASS_DEVICE void + begin_loop(int epi_m, int epi_n) { + if constexpr (EnableNullptr) { + if (params_ptr->ptr_aux == nullptr) { + fill(tC_rAux, params_ptr->null_default); + return; + } + } + constexpr auto MCL = decltype(max_common_layout(tC_gAux(_,_,_,_0{},_0{}), tC_rAux)){}; + constexpr int V = cute::min(Alignment, size(MCL)); + + Tensor tC_gAux_vec = recast>(coalesce(tC_gAux(_,_,_,epi_m,epi_n))); + Tensor tC_rAux_vec = recast>(coalesce(tC_rAux)); + + Tensor tC_cAux_vec = tensor<1>(zipped_divide(coalesce(tC_cAux(_,_,_,epi_m,epi_n)), MCL.compose(Int{}))); + Tensor 
tC_pAux_vec = cute::lazy::transform(tC_cAux_vec, [&](auto const& c){ return elem_less(c, problem_shape_mnl); }); + + copy_if(tC_pAux_vec, tC_gAux_vec, tC_rAux_vec); + } + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + return recast>(tC_rAux)(epi_v); + } + }; + + template < + bool ReferenceSrc, + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + + auto problem_shape_mnl = make_shape(M,N,L); + + // Gmem Tensor + Tensor mAux = make_tensor( + make_gmem_ptr(params_ptr->ptr_aux), make_shape(M,N,L), params_ptr->dAux + ); + Tensor tC_gAux = sm90_partition_for_epilogue( + mAux, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + + // Register Tensor + Tensor tC_rAux = make_tensor(take<0,3>(shape(tC_gAux))); + + // Predication support + Tensor coordAux = make_identity_tensor(shape(mAux)); + Tensor tC_cAux = sm90_partition_for_epilogue( + coordAux, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + + return ConsumerStoreCallbacks( + cute::move(tC_gAux), + cute::move(tC_rAux), + cute::move(tC_cAux), + problem_shape_mnl, + params_ptr + ); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Broadcast Load Operations +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Scalar broadcast +// Supports reduction over multiple broadcasts to support fusions such as fp8 scaling factors +template< + class Element, + class StrideMNL_ = Stride<_0,_0,_0>, + int BroadcastCount = 1, + template class ReductionFn = multiplies +> +struct Sm90ScalarBroadcast { + using StrideMNL = StrideMNL_; + static_assert(is_static_v(StrideMNL{}))>); // batch stride can be dynamic or static + 
static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_0>{}); + + struct SharedStorage { }; + + struct Arguments { + Element scalars[BroadcastCount] = {}; + Element const* scalar_ptrs[BroadcastCount] = {}; + StrideMNL dScalar[BroadcastCount] = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter *cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + // This must be called after update_scalar is called + CUTLASS_DEVICE bool + is_zero() const { + if (get<2>(params_ptr->dScalar[0]) == 0) { + // Only 1 batch + return scalar == Element(0); + } + else { + // multiple batch + if (valid_scalar == false) { + // for stridedBatch kernel, if ptr has a valid address, we need to enable the epi_load warps. + return params_ptr->scalar_ptrs[0] == nullptr; + } + else { + // Check whether each batch is ZERO or not. 
+ return scalar == Element(0); + } + } + } + + CUTLASS_HOST_DEVICE + Sm90ScalarBroadcast() { } + + CUTLASS_HOST_DEVICE + Sm90ScalarBroadcast(Params const& params, SharedStorage const& shared_storage) + : params_ptr(¶ms) { + // Get the scalar for non-batched broadcast + if (size<2>(params_ptr->dScalar[0]) == 0) { + update_scalar(); + } + } + + Element scalar; + bool valid_scalar = false; + Params const* params_ptr; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + // Get the scalar for batched broadcast + if (size<2>(params_ptr->dScalar[0]) != 0) { + auto [m_coord, n_coord, k_coord, l_coord] = args.tile_coord_mnkl; + update_scalar(l_coord); + } + + return EmptyProducerLoadCallbacks{}; + } + + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(Element scalar) + : scalar(scalar) {} + + Element scalar; + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_scalar; + frg_scalar.fill(scalar); + + return frg_scalar; + } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + // Get the scalar for batched broadcast + if (get<2>(params_ptr->dScalar[0]) != 0) { + auto [m_coord, n_coord, k_coord, l_coord] = args.tile_coord_mnkl; + update_scalar(l_coord); + } + + return ConsumerStoreCallbacks(scalar); + } + +private: + CUTLASS_DEVICE void + update_scalar(int l_coord = 0) { + valid_scalar = true; + int l_offset = l_coord * size<2>(params_ptr->dScalar[0]); + + if (params_ptr->scalar_ptrs[0] != nullptr) { + scalar = params_ptr->scalar_ptrs[0][l_offset]; + } + else { + // batch stride is ignored for nullptr fallback + scalar = params_ptr->scalars[0]; + } + + // Do reduction over multiple broadcasts if necessary + ReductionFn reduction_fn; + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < BroadcastCount; ++i) { + if (params_ptr->scalar_ptrs[i] != nullptr) { + int rest_l_offset = l_coord * size<2>(params_ptr->dScalar[i]); + scalar = reduction_fn(scalar, params_ptr->scalar_ptrs[i][rest_l_offset]); + } + else { + // batch stride is ignored for nullptr fallback + scalar = reduction_fn(scalar, params_ptr->scalars[i]); + } + } + } + + template + CUTLASS_DEVICE void + update_scalar(cute::tuple) { + // Only support multiple L-modes with fully-broadcast scalar + scalar = params_ptr->scalars[0]; + valid_scalar = true; + } +}; + +// Scalar broadcast +// Supports reduction over multiple broadcasts to support fusions such as fp8 scaling factors +template< + class Element, + class StrideMNL_ = Stride<_0,_0,_0>, + int BroadcastCount = 1, + template class ReductionFn = multiplies +> +struct Sm90ScalarBroadcastPtrArray { + using StrideMNL = StrideMNL_; + static_assert(is_static_v(StrideMNL{}))>); // batch stride can be dynamic or static + static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_0>{}); + + struct SharedStorage { }; + + struct Arguments { + Element scalars[BroadcastCount] = {}; + Element const* scalar_ptrs[BroadcastCount] = {}; + Element const* const* 
scalar_ptr_arrays[BroadcastCount] = {}; + StrideMNL dScalar[BroadcastCount] = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter *cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + // producer load is needed if Element is not void + return !cute::is_void_v; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + // This must be called after update_scalar is called + CUTLASS_DEVICE bool + is_zero() const { + return scalar == Element(0); + } + + CUTLASS_HOST_DEVICE + Sm90ScalarBroadcastPtrArray() { } + + CUTLASS_HOST_DEVICE + Sm90ScalarBroadcastPtrArray(Params const& params, SharedStorage const& shared_storage) + : params_ptr(¶ms) { + // Get the scalar for non-batched broadcast + if (size<2>(params_ptr->dScalar[0]) == 0) { + update_scalar(); + } + } + + Element scalar; + Params const* params_ptr; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + // Always refresh scalar with the current group index so per-group + // alpha/beta values (provided through pointer arrays) are loaded + // correctly even when the L-stride is zero. 
+ auto [m_coord, n_coord, k_coord, l_coord] = args.tile_coord_mnkl; + update_scalar(l_coord); + + return EmptyProducerLoadCallbacks{}; + } + + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(Element scalar) + : scalar(scalar) {} + + Element scalar; + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_scalar; + frg_scalar.fill(scalar); + + return frg_scalar; + } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + auto [m_coord, n_coord, k_coord, l_coord] = args.tile_coord_mnkl; + update_scalar(l_coord); + + return ConsumerStoreCallbacks(scalar); + } + +private: + CUTLASS_DEVICE void + update_scalar(int l_coord = 0) { + int l_offset = l_coord * size<2>(params_ptr->dScalar[0]); + + if (params_ptr->scalar_ptr_arrays[0] != nullptr) { + // Pointer-array variant: each entry already points to the scalar of a group. + scalar = *(params_ptr->scalar_ptr_arrays[0][l_coord]); + } + else if (params_ptr->scalar_ptrs[0] != nullptr) { + // Strided pointer variant. + scalar = params_ptr->scalar_ptrs[0][l_offset]; + } + else { + // Literal fallback. 
+ scalar = params_ptr->scalars[0]; + } + + // Do reduction over multiple broadcasts if necessary + ReductionFn reduction_fn; + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < BroadcastCount; ++i) { + + if (params_ptr->scalar_ptr_arrays[i] != nullptr) { + scalar = reduction_fn(scalar, *(params_ptr->scalar_ptr_arrays[i][l_coord])); + } + else if (params_ptr->scalar_ptrs[i] != nullptr) { + int rest_l_offset = l_coord * size<2>(params_ptr->dScalar[i]); + scalar = reduction_fn(scalar, params_ptr->scalar_ptrs[i][rest_l_offset]); + } + else { + scalar = reduction_fn(scalar, params_ptr->scalars[i]); + } + } + } +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template +[[deprecated("row broadcast only uses 0 stages")]] constexpr int +compute_row_broadcast_stages() { + return ceil_div(StagesC, size<1>(zipped_divide(make_layout(take<0,2>(CtaTileShapeMNK{})), EpilogueTile{}))) + 1; +} + +} + +// Row vector broadcast +template< + int Stages, + class CtaTileShapeMNK, + class ElementInput_, + class ElementCompute = cute::remove_pointer_t, + class StrideMNL_ = Stride<_0,_1,_0>, + int Alignment = 128 / sizeof_bits_v>, + bool EnableNullptr = true // Fallback scalar broadcast for nullptr params +> +struct Sm90RowBroadcast { + using StrideMNL = StrideMNL_; + // Get base element input type. + using ElementInput = cute::remove_pointer_t; + // Check if input is an array of pointers. 
+ static constexpr bool IsArrayOfPointers = is_same_v; + using PtrRowType = cute::conditional_t; + + static_assert(Stages == 0, "Row broadcast doesn't support smem pipelining"); + + static constexpr bool IsDynamicBroadcast = is_same_v(StrideMNL{}))>, bool>; // row vector or scalar broadcast + static_assert(is_static_v(StrideMNL{}))> || IsDynamicBroadcast); // batch stride can be dynamic or static + static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_1>{} || IsDynamicBroadcast); + + struct SharedStorage { + array_aligned(CtaTileShapeMNK{})> smem; + }; + + struct Arguments { + PtrRowType ptr_row = nullptr; + ElementInput null_default = ElementInput(0); + StrideMNL dRow = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm90RowBroadcast() { } + + CUTLASS_HOST_DEVICE + Sm90RowBroadcast(Params const& params, SharedStorage const& shared_storage) + : params(params), is_zero_(false), + smem(const_cast(shared_storage.smem.data())) { + auto const& [stride_M, stride_N, stride_L] = params.dRow; + // Nullptr default + if (EnableNullptr && params.ptr_row == nullptr) { + is_zero_ = params.null_default == ElementCompute(0); + } + // Dynamic non-batched scalar broadcast + else if (IsDynamicBroadcast && stride_N == bool(0) && stride_L == repeat_like(stride_L, 0)) { + if constexpr (!IsArrayOfPointers) { + is_zero_ = 
params.ptr_row[0] == ElementInput(0); + } + } + } + + Params params; + bool is_zero_ = false; + ElementInput *smem = nullptr; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_zero() const { + return is_zero_; + } + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + GS_GTensor tGS_gRow_, GS_STensor tGS_sRow_, + GS_CTensor tGS_cRow_, Tiled_G2S tiled_g2s_, + SR_STensor tSR_sRow_, SR_RTensor tSR_rRow_, + Residue residue_cRow_, Params const& params_) + : tGS_gRow(tGS_gRow_) + , tGS_sRow(tGS_sRow_) + , tGS_cRow(tGS_cRow_) + , tiled_G2S(tiled_g2s_) + , tSR_sRow(tSR_sRow_) + , tSR_rRow(tSR_rRow_) + , residue_cRow(residue_cRow_) + , params(params_) { + } + + GS_GTensor tGS_gRow; // (CPY,CPY_M,CPY_N) + GS_STensor tGS_sRow; // (CPY,CPY_M,CPY_N) + GS_CTensor tGS_cRow; // (CPY,CPY_M,CPY_N) + Tiled_G2S tiled_G2S; + + SR_STensor tSR_sRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + SR_RTensor tSR_rRow; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + + Residue residue_cRow; // (m, n) + Params const& params; + + CUTLASS_DEVICE void + begin() { + bool is_nullptr = EnableNullptr && params.ptr_row == nullptr; + + Tensor tGS_gRow_flt = filter_zeros(tGS_gRow); + Tensor tGS_sRow_flt = filter_zeros(tGS_sRow); + Tensor tGS_cRow_flt = filter_zeros(tGS_cRow, tGS_gRow.stride()); + + for (int i = 0; i < size(tGS_gRow_flt); ++i) { + if (get<1>(tGS_cRow_flt(i)) >= size<1>(CtaTileShapeMNK{})) { + continue; // OOB of SMEM, + } + if (not is_nullptr && elem_less(tGS_cRow_flt(i), residue_cRow)) { + tGS_sRow_flt(i) = tGS_gRow_flt(i); // issue async gmem to smem load + } + else { + tGS_sRow_flt(i) = params.null_default; // fill OOB values so smem to RF load can issue without predication + 
} + } + } + + CUTLASS_DEVICE bool + begin_sync_needed() const { + return true; // Ensure visibility of async gmem to smem loads + } + + CUTLASS_DEVICE void + begin_loop(int epi_m, int epi_n) { + if (epi_m == 0) { // Assumes M-major subtile loop + Tensor tSR_sRow_flt = filter_zeros(tSR_sRow(_,_,_,epi_m,epi_n)); + Tensor tSR_rRow_flt = make_tensor_like(tSR_sRow_flt); + copy_aligned(tSR_sRow_flt, tSR_rRow_flt); + + constexpr int FrgSize = size(tSR_rRow_flt); + using FrgInput = Array; + using FrgCompute = Array; + using ConvertInput = NumericArrayConverter; + + Tensor tSR_rRow_input_frg = recast(coalesce(tSR_rRow_flt)); + Tensor tSR_rRow_compute_frg = recast(filter(tSR_rRow)); + ConvertInput convert_input{}; + + tSR_rRow_compute_frg(_0{}) = convert_input(tSR_rRow_input_frg(_0{})); + } + } + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_row; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + frg_row[i] = tSR_rRow(epi_v * FragmentSize + i); + } + + return frg_row; + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + using ThreadCount = decltype(size(args.tiled_copy)); + + auto layout_N = [&] () CUTLASS_LAMBDA_FUNC_INLINE { + auto shape_N = get<1>(args.problem_shape_mnkl); + if constexpr (IsDynamicBroadcast) { + auto stride_N = repeat_like(shape_N, int(0)); + if (get<1>(params.dRow) == bool(1)) { + stride_N = transform_leaf(compact_major(shape_N), + [] (auto const& stride) { return static_cast(stride); } + ); + } + return make_layout(shape_N, stride_N); + } + else { + return make_layout(shape_N); + } + }(); + + auto layout_M = make_layout(M, repeat_like(M, _0{})); + auto layout_L = make_layout(L, get<2>(params.dRow)); + ElementInput const* ptr_row = nullptr; + if constexpr(IsArrayOfPointers) { + if (!(EnableNullptr && params.ptr_row == nullptr)) { + ptr_row = params.ptr_row[l]; + } + } else { + ptr_row = params.ptr_row; + } + Tensor mRow = make_tensor(make_gmem_ptr(ptr_row), make_layout(layout_M,layout_N,layout_L)); + Tensor gRow = local_tile(mRow(_,_,l), take<0,2>(args.tile_shape_mnk), make_coord(m, n)); // (CTA_M, CTA_N) + Tensor sRow = make_tensor(make_smem_ptr(smem), + make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{})), make_shape(_0{}, _1{})); // (CTA_M, CTA_N) + //// G2S: Gmem to Smem + auto tiled_g2s = make_tiled_copy(Copy_Atom{}, + Layout< Shape<_1, ThreadCount>, + Stride<_0, _1>>{}, + Layout<_1>{}); + auto thr_g2s = tiled_g2s.get_slice(args.thread_idx); + Tensor tGS_gRow = thr_g2s.partition_S(gRow); + Tensor tGS_sRow = thr_g2s.partition_D(sRow); + + //// G2S: Coord + Tensor tGS_cRow = thr_g2s.partition_S(args.cD); + + //// S2R: Smem to Reg + Tensor tSR_sRow = sm90_partition_for_epilogue(sRow, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tSR_rRow = make_tensor_like(take<0,3>(tSR_sRow)); // (CPY,CPY_M,CPY_N) + + return ConsumerStoreCallbacks( + tGS_gRow, + 
tGS_sRow, + tGS_cRow, tiled_g2s, + tSR_sRow, + tSR_rRow, + args.residue_cD, + params); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Column vector broadcast +template< + int Stages, + class CtaTileShapeMNK, + class ElementInput_, + class ElementCompute = cute::remove_pointer_t, + class StrideMNL_ = Stride<_1,_0,_0>, + int Alignment = 128 / sizeof_bits_v>, + bool EnableNullptr = true // Fallback scalar broadcast for nullptr params +> +struct Sm90ColBroadcast { + using StrideMNL = StrideMNL_; + // Get base element input type. + using ElementInput = cute::remove_pointer_t; + // Check if input is an array of pointers. + static constexpr bool IsArrayOfPointers = is_same_v; + using PtrColType = cute::conditional_t; + + static_assert(Stages == 0, "Column broadcast doesn't support smem pipelining"); + + static constexpr bool IsDynamicBroadcast = is_same_v(StrideMNL{}))>, bool>; // Column vector or scalar broadcast + static_assert(is_static_v(StrideMNL{}))> || IsDynamicBroadcast); // batch stride can be dynamic or static + static_assert(take<0,2>(StrideMNL{}) == Stride<_1,_0>{} || IsDynamicBroadcast); + + // Accumulator distributes col elements evenly amongst threads so we can just directly load from gmem + struct SharedStorage { }; + + struct Arguments { + PtrColType ptr_col = nullptr; + ElementInput null_default = ElementInput(0); + StrideMNL dCol = {}; + }; + + struct Params { + PtrColType ptr_col = nullptr; + ElementCompute null_default = ElementCompute(0); + StrideMNL dCol = {}; + }; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return {args.ptr_col, ElementCompute(args.null_default), args.dCol}; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, 
Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_zero() const { + return is_zero_; + } + + CUTLASS_HOST_DEVICE + Sm90ColBroadcast() { } + + CUTLASS_HOST_DEVICE + Sm90ColBroadcast(Params const& params, SharedStorage const& shared_storage) + : params(params), is_zero_(false) { + auto const& [stride_M, stride_N, stride_L] = params.dCol; + // Nullptr default + if (EnableNullptr && params.ptr_col == nullptr) { + is_zero_ = params.null_default == ElementCompute(0); + } + // Dynamic non-batched scalar broadcast + else if (IsDynamicBroadcast && stride_M == bool(0) && stride_L == repeat_like(stride_L, 0)) { + if constexpr (!IsArrayOfPointers) { + is_zero_ = params.ptr_col[0] == ElementInput(0); + } + } + } + + Params params; + bool is_zero_; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(GTensor tCgCol_, RTensor tCrCol_, CTensor tCcCol_, ThrResidue residue_tCcCol_, Params const& params_) + : tCgCol(tCgCol_), + tCrCol(tCrCol_), + tCcCol(tCcCol_), + residue_tCcCol(residue_tCcCol_), + params(params_) { + if (EnableNullptr && params.ptr_col == nullptr) { + fill(tCrCol, params.null_default); + } + } + + GTensor tCgCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + RTensor tCrCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + CTensor tCcCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + ThrResidue residue_tCcCol; + Params const& params; + + CUTLASS_DEVICE void + begin() { + if (EnableNullptr && 
params.ptr_col == nullptr) { + return; + } + + // Filter so we don't issue redundant copies over stride-0 modes + // (only works if 0-strides are in same location, which is by construction) + Tensor tCgCol_flt = filter_zeros(tCgCol); + Tensor tCrCol_flt = make_tensor_like(filter_zeros(tCrCol)); + Tensor tCcCol_flt = filter_zeros(tCcCol, tCgCol.stride()); + + constexpr auto MCL = decltype(max_common_layout(tCgCol_flt, tCrCol_flt)){}; + constexpr int V = cute::min(Alignment, size(MCL)); + if constexpr (V > 1) { + using VecType = uint_bit_t>; + Tensor tCgCol_vec = recast(coalesce(tCgCol_flt)); + Tensor tCrCol_vec = recast(coalesce(tCrCol_flt)); + Tensor tCcCol_vec = tensor<1>(zipped_divide(tCcCol_flt, MCL.compose(Int{}))); + Tensor tCpCol_vec = cute::lazy::transform(tCcCol_vec, [&](auto const& c){ return elem_less(c, residue_tCcCol); }); + copy_if(tCpCol_vec, tCgCol_vec, tCrCol_vec); + } + else { + Tensor tCpCol_flt = cute::lazy::transform(tCcCol_flt, [&](auto const& c){ return elem_less(c, residue_tCcCol); }); + copy_if(tCpCol_flt, tCgCol_flt, tCrCol_flt); + } + + constexpr int FrgSize = size(tCrCol_flt); + using FrgInput = Array; + using FrgCompute = Array; + using ConvertInput = NumericArrayConverter; + + Tensor tCrCol_input_frg = recast(coalesce(tCrCol_flt)); + Tensor tCrCol_compute_frg = recast(filter(tCrCol)); + ConvertInput convert_input{}; + + tCrCol_compute_frg(_0{}) = convert_input(tCrCol_input_frg(_0{})); + } + + template + CUTLASS_DEVICE Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_col; + Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + frg_col[i] = tCrCol_mn(epi_v * FragmentSize + i); + } + + return frg_col; + } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + auto layout_M = [&] () CUTLASS_LAMBDA_FUNC_INLINE { + auto shape_M = get<0>(args.problem_shape_mnkl); + if constexpr (IsDynamicBroadcast) { + auto stride_M = repeat_like(shape_M, int(0)); + if (get<0>(params.dCol) == bool(1)) { + stride_M = transform_leaf(compact_major(shape_M), + [] (auto const& stride) { return static_cast(stride); } + ); + } + return make_layout(shape_M, stride_M); + } + else { + return make_layout(shape_M); + } + }(); + + auto layout_N = make_layout(N, repeat_like(N, _0{})); + auto layout_L = make_layout(L, get<2>(params.dCol)); + ElementInput const* ptr_col = nullptr; + if constexpr(IsArrayOfPointers) { + if (!(EnableNullptr && params.ptr_col == nullptr)) { + ptr_col = params.ptr_col[l]; + } + } else { + ptr_col = params.ptr_col; + } + Tensor mCol = make_tensor(make_gmem_ptr(ptr_col), make_layout(layout_M,layout_N,layout_L)); + Tensor tCgCol = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + + Tensor mCol_static = make_tensor(make_gmem_ptr(ptr_col), make_layout(make_layout(M),layout_N,layout_L)); + Tensor tCgCol_static = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + mCol_static, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tCrCol = make_tensor_like(tCgCol_static); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + + return ConsumerStoreCallbacks(tCgCol, tCrCol, args.tCcD, args.residue_tCcD, params); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Batch matrix broadcast +// Only need to redefine this if we can multicast across cluster L +template < + int Stages, + class EpilogueTile, + class Element, + class StrideMNL, + 
class SmemLayoutAtom, + class CopyOpS2R, + int Alignment = 128 / sizeof_bits_v, + bool EnableNullptr = true // Fallback scalar broadcast for nullptr params +> +using Sm90MatrixBroadcast + = Sm90AuxLoad; + +namespace detail { + +template +struct IsScalarBroadcast { + static constexpr bool value = false; +}; + +template +struct IsScalarBroadcast(typename Operation::StrideMNL{})), Stride<_0,_0>>>> { + static constexpr bool value = true; +}; + +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::fusion + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp new file mode 100644 index 0000000000000000000000000000000000000000..06ad8082e57cedf4d16aecdad8a995e838e1c93e --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp @@ -0,0 +1,1722 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief Visitor tree store operations for the sm90 TMA warp-specialized (ws) epilogue +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/workspace.h" + +#include "cute/tensor.hpp" +#include "sm90_visitor_tma_warpspecialized.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::fusion { + +using namespace cute; +using namespace detail; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Elementwise Store Operations +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + int Stages, + class EpilogueTile, + class Element, + FloatRoundStyle RoundStyle, + class StrideMNL, + class SmemLayoutAtom, + class CopyOpR2S, + int Alignment = 128 / sizeof_bits_v, + bool EnableNullptr = true // Noop on nullptr params +> +struct Sm90AuxStore { + using ElementAux = Element; + static_assert(Alignment * sizeof_bits_v % 128 == 0, "sub-16B alignment not supported yet"); + + constexpr static bool is_m_major = epilogue::collective::detail::is_m_major(); + // Find the max contiguous layout usable by TMA (if EpilogueTile is a non-compact tiler) + using SmemShapeTma = decltype(make_shape( + max_common_vector(make_layout(get<0>(EpilogueTile{})),make_layout(get<0>(EpilogueTile{}))), + max_common_vector(make_layout(get<1>(EpilogueTile{})),make_layout(get<1>(EpilogueTile{}))))); + using SmemLayoutTma = decltype(tile_to_shape( + SmemLayoutAtom{}, SmemShapeTma{}, + cute::conditional_t, Step<_1,_2>>{} )); + using SmemLayout = decltype(tile_to_shape( + SmemLayoutTma{}, + make_shape(size<0>(shape(EpilogueTile{})), size<1>(shape(EpilogueTile{})), Int{}), + cute::conditional_t, Step<_1,_2,_3>>{} )); + + struct SharedStorage { + 
alignas(cutlass::detail::alignment_for_swizzle(SmemLayout{})) + array_aligned smem_aux; + }; + + struct Arguments { + Element* ptr_aux = nullptr; + StrideMNL dAux = {}; + }; + + struct Params { + using TMA_Aux = decltype(make_tma_copy( + SM90_TMA_STORE{}, + make_tensor(static_cast(nullptr), repeat_like(StrideMNL{}, int32_t(0)), StrideMNL{}), + SmemLayoutTma{})); + TMA_Aux tma_store_aux; + bool is_nullptr = false; + }; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + // Optionally append 1s until problem shape is rank-4 in case its is only rank-3 (MNK) + auto problem_shape_mnkl = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_mnkl; + + bool is_nullptr = false; + if constexpr (EnableNullptr) { + is_nullptr = args.ptr_aux == nullptr; + } + + typename Params::TMA_Aux tma_store_aux; + if (not is_nullptr) { + Tensor tensor_aux = make_tensor(args.ptr_aux, make_layout(make_shape(M,N,L), args.dAux)); + tma_store_aux = make_tma_copy(SM90_TMA_STORE{}, tensor_aux, SmemLayoutTma{}); + } + + return {tma_store_aux, is_nullptr}; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm90AuxStore() { } + + CUTLASS_HOST_DEVICE + Sm90AuxStore(Params const& params, SharedStorage const& shared_storage) + : params_ptr(¶ms), + smem_aux(const_cast(shared_storage.smem_aux.data())) { } + + Params const* params_ptr; + Element* smem_aux; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + 
CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template < + class RTensor, + class TiledR2S, + class STensorR2S, + class STensorS2G, + class GTensorS2G + > + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + RTensor&& tC_rAux, + TiledR2S tiled_r2s, + STensorR2S&& tRS_sAux, + STensorS2G&& bSG_sAux, + GTensorS2G&& bSG_gAux, + Params const* params_ptr) + : tiled_r2s(tiled_r2s), + tC_rAux(cute::forward(tC_rAux)), + tRS_sAux(cute::forward(tRS_sAux)), + bSG_sAux(cute::forward(bSG_sAux)), + bSG_gAux(cute::forward(bSG_gAux)), + params_ptr(params_ptr) {} + + TiledR2S tiled_r2s; + RTensor tC_rAux; // (CPY,CPY_M,CPY_N) + STensorR2S tRS_sAux; // (R2S,R2S_M,R2S_N,PIPE) + STensorS2G bSG_sAux; // (S2G,S2G_M,S2G_N,PIPE) + GTensorS2G bSG_gAux; // (S2G,S2G_M,S2G_N,EPI_M,EPI_N) + Params const* params_ptr; + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n, + Array const& frg_input) { + using ConvertInput = NumericArrayConverter; + ConvertInput convert_input{}; + + Tensor tC_rAux_frg = recast>(coalesce(tC_rAux)); // (EPI_V) + tC_rAux_frg(epi_v) = convert_input(frg_input); + + return frg_input; + } + + CUTLASS_DEVICE void + postreduce(int epi_m, int epi_n, int store_iteration, bool issue_smem_store) { + if constexpr (EnableNullptr) { + if (params_ptr->is_nullptr) { + return; + } + } + + using RLayoutR2S = decltype(cute::layout(TiledR2S{}.get_slice(0).retile_S(RTensor{}))); + Tensor tRS_rAux = make_tensor(tC_rAux.data(), RLayoutR2S{}); // (R2S,R2S_M,R2S_N) + + if (issue_smem_store) { + int store_pipe_index = store_iteration % Stages; + copy(tiled_r2s, tRS_rAux, tRS_sAux(_,_,_,store_pipe_index)); + } + } + + CUTLASS_DEVICE void + tma_store(int epi_m, int epi_n, int store_iteration, bool issue_tma_store) { + if constexpr 
(EnableNullptr) { + if (params_ptr->is_nullptr) { + return; + } + } + + if (issue_tma_store) { + // Issue the TMA store + int store_pipe_index = store_iteration % Stages; + copy(params_ptr->tma_store_aux, bSG_sAux(_,_,_,store_pipe_index), bSG_gAux(_,_,_,epi_m,epi_n)); + } + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + Tensor mAux = params_ptr->tma_store_aux.get_tma_tensor(make_shape(M,N,L)); // (M,N,L) + Tensor gAux = local_tile(mAux, take<0,2>(args.tile_shape_mnk), make_coord(m,n,l)); // (CTA_M,CTA_N) + + Tensor tC_gAux = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + gAux, args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tC_rAux = make_tensor(take<0,3>(shape(tC_gAux))); // (CPY,CPY_M,CPY_N) + + Tensor sAux_epi = cute::as_position_independent_swizzle_tensor( + make_tensor(make_smem_ptr(smem_aux), SmemLayout{})); // (EPI_TILE_M,EPI_TILE_N,PIPE) + Tensor gAux_epi = flat_divide(gAux, args.epi_tile); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N) + + auto tiled_r2s = conditional_return( + make_tiled_copy_S(Copy_Atom{}, args.tiled_copy), + make_tiled_copy_D(Copy_Atom{}, args.tiled_copy) + ); + auto tRS_sAux = tiled_r2s.get_slice(args.thread_idx).partition_D(sAux_epi); // (R2S,R2S_M,R2S_N,PIPE) + + ThrCopy thrblk_s2g = params_ptr->tma_store_aux.get_slice(_0{}); + Tensor bSG_sAux = thrblk_s2g.partition_S(sAux_epi); // (TMA,TMA_M,TMA_N,PIPE) + Tensor bSG_gAux = thrblk_s2g.partition_D(gAux_epi); // (TMA,TMA_M,TMA_N,EPI_M,EPI_N) + + return ConsumerStoreCallbacks( + cute::move(tC_rAux), + tiled_r2s, + cute::move(tRS_sAux), + cute::move(bSG_sAux), + cute::move(bSG_gAux), + params_ptr); + } +}; + +template < + class Element, + class EpilogueTile, // Unused + FloatRoundStyle RoundStyle, + 
class LayoutOrStrideMNL, + class SmemLayoutAtom, // Unused + class CopyOpR2S, // Unused + int Alignment, + bool EnableNullptr +> +struct Sm90AuxStore< + 0, EpilogueTile, Element, RoundStyle, LayoutOrStrideMNL, + SmemLayoutAtom, CopyOpR2S, Alignment, EnableNullptr +> { + using ElementAux = Element; + using StrideMNL = cutlass::gemm::TagToStrideC_t; + + struct SharedStorage { }; + + struct Arguments { + Element* ptr_aux = nullptr; + StrideMNL dAux = {}; + }; + + using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return cutlass::Status::kSuccess; + } + + CUTLASS_HOST_DEVICE + Sm90AuxStore() { } + + CUTLASS_HOST_DEVICE + Sm90AuxStore(Params const& params, SharedStorage const& shared_storage) + : params_ptr(¶ms) { } + + Params const* params_ptr; + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template< + class GTensorR2G, + class RTensor, + class CTensorR2G, + class ProblemShapeMNL + > + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + GTensorR2G&& tC_gAux, + RTensor&& tC_rAux, + CTensorR2G&& tC_cAux, + ProblemShapeMNL problem_shape_mnl, + Params const* params_ptr) + : 
tC_gAux(cute::forward(tC_gAux)), + tC_rAux(cute::forward(tC_rAux)), + tC_cAux(cute::forward(tC_cAux)), + problem_shape_mnl(problem_shape_mnl), + params_ptr(params_ptr) {} + + GTensorR2G tC_gAux; + RTensor tC_rAux; + CTensorR2G tC_cAux; + ProblemShapeMNL problem_shape_mnl; + Params const* params_ptr; + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n, + Array const& frg_input) { + using ConvertInput = NumericArrayConverter; + ConvertInput convert_input{}; + + Tensor tC_rAux_frg = recast>(coalesce(tC_rAux)); + tC_rAux_frg(epi_v) = convert_input(frg_input); + + return frg_input; + } + + CUTLASS_DEVICE void + end_loop(int epi_m, int epi_n) { + if constexpr (EnableNullptr) { + if (params_ptr->ptr_aux == nullptr) { + return; + } + } + + constexpr auto MCL = decltype(max_common_layout(tC_gAux(_,_,_,_0{},_0{}), tC_rAux)){}; + constexpr int V = cute::min(Alignment, size(MCL)); + + Tensor tC_gAux_vec = recast>(coalesce(tC_gAux(_,_,_,epi_m,epi_n))); + Tensor tC_rAux_vec = recast>(coalesce(tC_rAux)); + + Tensor tC_cAux_vec = tensor<1>(zipped_divide(coalesce(tC_cAux(_,_,_,epi_m,epi_n)), MCL.compose(Int{}))); + Tensor tC_pAux_vec = cute::lazy::transform(tC_cAux_vec, [&](auto const& c){ return elem_less(c, problem_shape_mnl); }); + + copy_if(tC_pAux_vec, tC_rAux_vec, tC_gAux_vec); + } + }; + + template < + bool ReferenceSrc, + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + + auto problem_shape_mnl = make_shape(M,N,L); + + // Gmem Tensor + Tensor mAux = make_tensor( + make_gmem_ptr(params_ptr->ptr_aux), make_shape(M,N,L), params_ptr->dAux + ); + Tensor tC_gAux = sm90_partition_for_epilogue( + mAux, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + + // Register Tensor + Tensor tC_rAux = make_tensor(take<0,3>(shape(tC_gAux))); + + // Predication support + Tensor coordAux = make_identity_tensor(shape(mAux)); + Tensor tC_cAux = sm90_partition_for_epilogue( + coordAux, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + + return ConsumerStoreCallbacks( + cute::move(tC_gAux), + cute::move(tC_rAux), + cute::move(tC_cAux), + problem_shape_mnl, + params_ptr + ); + + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Reduction Store Operations +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Scalar reduction +template < + template class RegReduceFn, + template class GmemReduceFn, + class ElementOutput, + class ElementCompute, + FloatRoundStyle RoundStyle, + class StrideMNL = Stride<_0,_0,_0>, + bool EnableNullptr = true // Noop on nullptr params +> +struct Sm90ScalarReduction { +private: + static_assert(is_static_v(StrideMNL{}))>); // batch stride can be dynamic or static + static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_0>{}); + static constexpr bool IsAtomic = is_atomic>::value; + static_assert(IsAtomic, "non-atomic scalar reduction not supported yet"); + +public: + struct SharedStorage { }; + + struct Arguments { + ElementOutput* ptr_scalar = nullptr; + ElementCompute reduction_identity = ElementCompute(0); + StrideMNL dScalar = {}; + }; + + 
using Params = Arguments; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return args; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + #if !defined(CUTLASS_SKIP_REDUCTION_INIT) + if constexpr (IsAtomic) { + auto problem_shape_mnkl = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_mnkl; + Layout mScalar_layout = make_layout(make_shape(M,N,L), args.dScalar); + if (args.ptr_scalar != nullptr) { + return fill_workspace(args.ptr_scalar, ElementOutput(args.reduction_identity), cosize(mScalar_layout), stream, cuda_adapter); + } + } + #endif + + return cutlass::Status::kSuccess; + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_HOST_DEVICE + Sm90ScalarReduction() { } + + CUTLASS_HOST_DEVICE + Sm90ScalarReduction(Params const& params, SharedStorage const& shared_storage) + : params(params) { } + + Params const params; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks( + int l_coord, + CTensor tCcScalar, + ThrResidue residue_tCcScalar, + Params const& params) + : scalar(params.reduction_identity), + l_coord(l_coord), + tCcScalar(tCcScalar), + residue_tCcScalar(residue_tCcScalar), + params(params) {} + + ElementCompute scalar; + int l_coord; + 
CTensor tCcScalar; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + ThrResidue residue_tCcScalar; + Params params; + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n, + Array const& frg_input) { + if constexpr (EnableNullptr) { + if (params.ptr_scalar == nullptr) { + return frg_input; + } + } + + using ConvertInput = NumericArrayConverter; + using ReduceInput = RegReduceFn; + ConvertInput convert_input{}; + ReduceInput reduce_input{}; + + Array frg_I = convert_input(frg_input); + Tensor tCcScalar_mn = tCcScalar(_,_,_,epi_m,epi_n); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + if (elem_less(tCcScalar_mn(epi_v * FragmentSize + i), residue_tCcScalar)) { + scalar = reduce_input(scalar, frg_I[i]); + } + } + + return frg_input; + } + + CUTLASS_DEVICE void + end() { + if constexpr (EnableNullptr) { + if (params.ptr_scalar == nullptr) { + return; + } + } + + using ConvertI = NumericConverter; + using ReduceInput = GmemReduceFn; + + ConvertI convert_I{}; + ReduceInput reduce_input{}; + + ElementOutput* ptr_scalar = params.ptr_scalar + l_coord * get<2>(params.dScalar); + reduce_input(ptr_scalar, convert_I(scalar)); + } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + return ConsumerStoreCallbacks( + get<3>(args.tile_coord_mnkl), args.tCcD, args.residue_tCcD, params); + } + +}; + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Row vector reduction +template < + template class RegReduceFn, + template class ShuffleReduceFn, + template class GmemReduceFn, + int Stages, + class CtaTileShapeMNK, + class ElementOutput, + class ElementCompute, + FloatRoundStyle RoundStyle, + class StrideMNL = Stride<_0,_1,_0>, + int Alignment = 128 / sizeof_bits_v, + bool EnableNullptr = true, // Noop on nullptr params + // If this is false, ptr_row is assumed to point to a compact n-major (ceil_div(M,CTA_M), round_nearest(N,CTA_N), L) + // tensor of ElementCompute. It is the user's responsibility to reduce this to a (N, L) tensor of ElementOutput + bool FinalReduction = true, + // False means skip OOB predication if OOB inputs are known to be the reduction identity + bool VisitCheckOOB = true, + // Indicate the parameter order when calling RegReduceFn + // Seq length equals the number of RegReduceFn parameters + // No.0 represents tCrRow; No.1 and subsequent numbers sequentially represent frg_inputs in `visit` + class RegReduceSeq = cute::seq<0, 1> +> +struct Sm90RowReduction { +private: + static_assert(Stages == 0, "Smem usage not supported yet"); + static_assert(Alignment * sizeof_bits_v % 128 == 0, "sub-16B alignment not supported yet"); + static_assert(is_static_v(StrideMNL{}))>); // batch stride can be dynamic or static + static_assert(take<0,2>(StrideMNL{}) == Stride<_0,_1>{}); + static constexpr bool IsAtomic = is_atomic>::value; + static_assert(not (IsAtomic && not FinalReduction), "atomic reduction must be final"); + +public: + struct SharedStorage { }; + + struct Arguments { + void* ptr_row = nullptr; // ElementOutput* if FinalReduction, else ElementCompute* + ElementCompute 
reduction_identity = ElementCompute(0); + StrideMNL dRow = {}; + }; + + struct Params { + void* ptr_row = nullptr; + ElementCompute reduction_identity = ElementCompute(0); + StrideMNL dRow = {}; + ElementCompute* reduction_buffer = nullptr; + int* tile_counters = nullptr; + }; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + ElementCompute* reduction_buffer; + int* tile_counters = nullptr; + if constexpr (IsAtomic) { + reduction_buffer = nullptr; + } + else if constexpr (FinalReduction) { + auto problem_shape_mnkl = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_mnkl; + auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{}; + size_t tile_counters_offset = product(ceil_div(make_shape(size<>(M), size<>(N), L), make_shape(tile_M, tile_N))) * tile_N * sizeof(ElementCompute); + tile_counters_offset = round_nearest(tile_counters_offset, MinWorkspaceAlignment); + + reduction_buffer = reinterpret_cast(workspace); + tile_counters = reinterpret_cast(reinterpret_cast(workspace) + tile_counters_offset); + } + else { + reduction_buffer = reinterpret_cast(args.ptr_row); + } + + return { + args.ptr_row, + args.reduction_identity, + args.dRow, + reduction_buffer, + tile_counters + }; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + if constexpr (IsAtomic || not FinalReduction) { + return 0; + } + + size_t workspace_size = 0; + auto problem_shape_mnkl = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_mnkl; + auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{}; + // Increment by size of reduction buffer + workspace_size += product(ceil_div(make_shape(size<>(M),size<>(N),L), make_shape(tile_M, tile_N))) * tile_N * sizeof(ElementCompute); + // Align and increment by size of 
tile counters + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + workspace_size += cute::ceil_div(size<>(N), tile_N) * sizeof(int); + return workspace_size; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + if constexpr (IsAtomic) { + auto problem_shape_mnkl = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_mnkl; + Layout mRow_layout = make_layout(make_shape(size<>(M),size<>(N),size<>(L)), args.dRow); + if (args.ptr_row != nullptr) { + return fill_workspace(args.ptr_row, ElementOutput(args.reduction_identity), cosize(mRow_layout), stream, cuda_adapter); + } + return Status::kSuccess; + } + else if constexpr (FinalReduction) { + auto problem_shape_mnkl = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_mnkl; + auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{}; + size_t tile_counters_offset = product(ceil_div(make_shape(size<>(M),size<>(N),L), make_shape(tile_M, tile_N))) * tile_N * sizeof(ElementCompute); + tile_counters_offset = round_nearest(tile_counters_offset, MinWorkspaceAlignment); + + int* tile_counters = reinterpret_cast(reinterpret_cast(workspace) + tile_counters_offset); + size_t tile_counters_size = cute::ceil_div(size<>(N), tile_N) * sizeof(int); + return zero_workspace(tile_counters, tile_counters_size, stream, cuda_adapter); + } + else { + return Status::kSuccess; + } + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_HOST_DEVICE + Sm90RowReduction() { } + + CUTLASS_HOST_DEVICE + Sm90RowReduction(Params const& params, SharedStorage const& shared_storage) + : params(params) { } + + Params params; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return 
EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(ArgsTuple&& args_tuple, Params const& params) + : args_tuple(cute::forward(args_tuple)), + params(params) {} + + ArgsTuple args_tuple; + Params const& params; + bool do_final_reduction = false; + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n, + Array const&... frg_inputs) { + if constexpr (EnableNullptr) { + if (params.ptr_row == nullptr) { + return cute::get<0>(cute::make_tuple(frg_inputs...)); + } + } + + auto& [ref_src, tCrRow, tCcRow, gRow_l, cRow, gBuf_ml, sBuf_layout, + lane_layout_MN, lane_mn, warp_layout_MN, warp_mn, + tile_coord_mnkl, residue_cRow, residue_tCcRow, epi_tile, tiled_copy, thread_idx] = args_tuple; + Tensor tCrRow_mn = tCrRow(_,_,_,epi_m,epi_n); + Tensor tCcRow_mn = tCcRow(_,_,_,epi_m,epi_n); + + if constexpr (VisitCheckOOB) { + using ReduceInput = RegReduceFn; + ReduceInput reduce_input{}; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + if (elem_less(tCcRow_mn(epi_v * FragmentSize + i), residue_tCcRow)) { + ElementCompute& tCrRow_vmn = tCrRow_mn(epi_v * FragmentSize + i); + tCrRow_vmn = transform_apply(cute::make_tuple(frg_inputs...), + [&] (auto&& frg_input) { + return ElementCompute(frg_input[i]); + }, + [&] (auto&&... 
cvt_frg_inputs) { + auto frg_compute_tuple = cute::make_tuple(tCrRow_vmn, cvt_frg_inputs...); + return cute::detail::apply(frg_compute_tuple, reduce_input, RegReduceSeq{}); + }); + } + } + } + else { + constexpr int RegFragSize = cute::max(1, static_cast(sizeof(uint32_t) / sizeof(ElementCompute))); + using ReduceInput = RegReduceFn>; + ReduceInput reduce_input{}; + Tensor tCrRow_mn_frg = recast>(tCrRow_mn); + + constexpr int RegFragArraySize = FragmentSize / RegFragSize; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < RegFragArraySize; ++i) { + Array& tCrRow_vmn_frg = tCrRow_mn_frg(epi_v * RegFragArraySize + i); + tCrRow_vmn_frg = transform_apply(cute::make_tuple(frg_inputs...), + [&] (auto&& frg_input) { + using ElementInput = typename cute::remove_cvref_t::Element; + using ConvertInput = NumericArrayConverter; + using RegFragArr = Array, RegFragArraySize>; + ConvertInput convert_input{}; + return convert_input(reinterpret_cast(frg_input)[i]); + }, + [&] (auto&&... cvt_frg_inputs) { + auto frg_compute_tuple = cute::make_tuple(tCrRow_vmn_frg, cvt_frg_inputs...); + return cute::detail::apply(frg_compute_tuple, reduce_input, RegReduceSeq{}); + }); + } + } + return cute::get<0>(cute::make_tuple(frg_inputs...)); + } + + template + CUTLASS_DEVICE void + reduce(STensor&& smem_buffer, SyncFn const& sync_fn, int epi_m, int epi_n, bool is_last_iteration, VTensor visit_results) { + if (not is_last_iteration) { + return; + } + + auto& [ref_src, tCrRow, tCcRow, gRow_l, cRow, gBuf_ml, sBuf_layout, + lane_layout_MN, lane_mn, warp_layout_MN, warp_mn, + tile_coord_mnkl, residue_cRow, residue_tCcRow, epi_tile, tiled_copy, thread_idx] = args_tuple; + auto [m, n, k, l] = tile_coord_mnkl; + constexpr bool ReferenceSrc = decltype(ref_src)::value; + if constexpr (EnableNullptr) { + if (params.ptr_row == nullptr) { + return; + } + } + + // fully OOB CTA in partially OOB cluster + if (not elem_less(cRow(_0{},_0{}), residue_cRow)) { + return; + } + + int lane_m = get<0>(lane_mn); + 
[[maybe_unused]] bool is_reduced_lane = lane_m == 0; + + // + // 1. Warp shuffle reduction + // + using FragmentShuffle = Array; + Tensor tCrRow_frg = recast(filter(tCrRow)); + using ReduceShuffle = ShuffleReduceFn; + ReduceShuffle reduce_shuffle{}; + + auto FrgSizePerLaneM = size(tCrRow_frg) / size<0>(lane_layout_MN); + constexpr bool SwapShuffle = FrgSizePerLaneM > 0; + + // + // Swap Shuffle + // + // The normal way to reduction among threads: + // use shuffle to let *** the first half of threads *** have *** whole data *** from the second half of threads. + // After each step of reduction, a half of threads won't work in the following steps. + // That is, as the reduction progresses, the efficiency of shuffle & reduction instructions gradually change from 1/2, 1/4 to 1/32 (the worst case). + // + // To overcome this shortcoming, for a NxN matrix to be reduced among N threads as a 1XN vectors, + // we use swap & shuffle aiming to let *** each half of threads *** have *** a half of data *** from the other half of threads. + // After reduction, each half of threads should deal with a (N/2)x(N/2) sub-matrix independently in the following step. + // We can recursively do this until the problem size is 1. 
+ // + if constexpr (SwapShuffle) { // for a NxN matrix to be reduced among N threads as a 1XN vectors + Tensor tCrRow_frg_ = logical_divide(tCrRow_frg, FrgSizePerLaneM); // (FrgSizePerLaneM, M) + CUTLASS_PRAGMA_UNROLL + for (int m = size<1>(tCrRow_frg_) / 2; m > 0; m /= 2) { + CUTLASS_PRAGMA_UNROLL + for (int r = 0; r < m; ++r) { + auto frg_A = tCrRow_frg_(_,r); + auto frg_B = tCrRow_frg_(_,r + m); + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < size(frg_A); ++v) { + // Step1: swap + if (not (lane_m & m)) { // the first half of threads swap fragments from the first half of data to the second + cutlass::swap(frg_A(v), frg_B(v)); + } + + // Step2: shuffle + uint64_t frg_shfl = reinterpret_cast(frg_A(v)); + // each half of threads get a half of data from the other half of threads + frg_shfl = __shfl_xor_sync(0xFFFFFFFF, frg_shfl, lane_layout_MN(m, _0{})); + + // Step3: reduction + frg_A(v) = reduce_shuffle(frg_B(v), reinterpret_cast(frg_shfl)); + } + } + } + } + else { + CUTLASS_PRAGMA_UNROLL + for (int reduction_rows = size<0>(lane_layout_MN) / 2; reduction_rows > 0; reduction_rows /= 2) { + CUTLASS_PRAGMA_UNROLL + for (int frg_idx = 0; frg_idx < size(tCrRow_frg); ++frg_idx) { + uint64_t frg_shfl = reinterpret_cast(tCrRow_frg(frg_idx)); + frg_shfl = __shfl_down_sync(0xFFFFFFFF, frg_shfl, lane_layout_MN(reduction_rows, _0{})); + tCrRow_frg(frg_idx) = reduce_shuffle(tCrRow_frg(frg_idx), reinterpret_cast(frg_shfl)); + } + } + } + + // + // 2. 
Atomic reduction + // + if constexpr (IsAtomic) { + // Filter so we don't issue redunant copies over stride-0 modes + Tensor tCrRow_flt = filter_zeros(tCrRow); + Tensor tCcRow_flt = make_tensor(tCcRow.data(), make_layout(tCrRow_flt.shape(), tCcRow.stride())); + auto FltFrgSizePerLaneM = size(tCrRow_flt) / size<0>(lane_layout_MN); + + Tensor tCgRow = sm90_partition_for_epilogue(gRow_l(_,_,l), epi_tile, tiled_copy, thread_idx); + Tensor tCgRow_flt = filter_zeros(tCgRow); + // NOTE: atomic reduction is performed in the output type + using ConvertOutput = NumericConverter; + using ReduceOutput = GmemReduceFn; + ConvertOutput convert_output{}; + ReduceOutput reduce_output{}; + + if constexpr (SwapShuffle) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FltFrgSizePerLaneM; ++i) { + int idx = lane_m * FltFrgSizePerLaneM + i; + // Only care about OOB for N mode + if (get<1>(tCcRow_flt(idx)) < get<1>(residue_tCcRow)) { + reduce_output(&tCgRow_flt(idx), convert_output(tCrRow_flt(i))); + } + } + } + else { + if (is_reduced_lane) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tCrRow_flt); ++i) { + if (elem_less(tCcRow_flt(i), residue_tCcRow)) { + reduce_output(&tCgRow_flt(i), convert_output(tCrRow_flt(i))); + } + } + } + } + sync_fn(); + } + + // + // 2. 
One warp in M, skip threadblock smem reduction + // + else if constexpr (decltype(size<0>(warp_layout_MN))::value <= 1) { + // Dump warp reduction to gmem workspace + using ElementGmem = cute::conditional_t; + Tensor tCgBuf = sm90_partition_for_epilogue(gBuf_ml(_,_,m,l), epi_tile, tiled_copy, thread_idx); + + if constexpr (SwapShuffle) { + Tensor tCrRow_flt = filter(tCrRow); + Tensor tCgBuf_flt = recast(filter(tCgBuf)); + auto FltFrgSizePerLaneM = size(tCrRow_flt) / size<0>(lane_layout_MN); + Tensor tCgBuf_flt_ = logical_divide(tCgBuf_flt, FltFrgSizePerLaneM); // (FltFrgSizePerLaneM, M) + Tensor tCrRow_flt_ = logical_divide(tCrRow_flt, FltFrgSizePerLaneM); // (FltFrgSizePerLaneM, M) + copy_aligned(tCrRow_flt_(_,_0{}), tCgBuf_flt_(_,lane_m)); + } + else { + if (is_reduced_lane) { + copy_aligned(tCrRow, recast(tCgBuf)); + } + } + sync_fn(); + } + + // + // 2. Multiple warps in M, do threadblock smem reduction + // + else { + Tensor sBuf = make_tensor(make_smem_ptr(raw_pointer_cast(smem_buffer.data())), sBuf_layout); + static_assert(decltype(cosize(sBuf.layout()))::value * sizeof(ElementCompute) <= + decltype(cosize(smem_buffer.layout()))::value * sizeof(typename remove_cvref_t::value_type), + "smem reduction buffer not large enough, use a larger epilogue tile"); + sync_fn(); + + // Dump warp reduction to smem workspace + Tensor tCsBuf = sm90_partition_for_epilogue(sBuf(_,_,get<0>(warp_mn)), epi_tile, tiled_copy, thread_idx); + + if constexpr (SwapShuffle) { + Tensor tCrRow_flt = filter(tCrRow); + Tensor tCsBuf_flt = filter(tCsBuf); + auto FltFrgSizePerLaneM = size(tCrRow_flt) / size<0>(lane_layout_MN); + Tensor tCsBuf_flt_ = logical_divide(tCsBuf_flt, FltFrgSizePerLaneM); // (FltFrgSizePerLaneM, M) + Tensor tCrRow_flt_ = logical_divide(tCrRow_flt, FltFrgSizePerLaneM); // (FltFrgSizePerLaneM, M) + copy_aligned(tCrRow_flt_(_,_0{}), tCsBuf_flt_(_,lane_m)); + } + else { + if (is_reduced_lane) { + copy_aligned(tCrRow, tCsBuf); + } + } + sync_fn(); + + constexpr int 
SmemFragSize = cute::max(size_t{1}, sizeof(uint32_t) / sizeof(ElementCompute)); + using FragmentSmem = Array; + using VectorSmem = uint_bit_t>; + using ReduceSmem = GmemReduceFn; + ReduceSmem reduce_smem{}; + + Tensor sBuf_frg = recast(filter_zeros(sBuf)); + Tensor sBuf_vec = recast(filter_zeros(sBuf)); + constexpr int FragsPerRow = decltype(size<1>(sBuf_frg))::value; + + constexpr int RowNum = decltype(size<0>(warp_layout_MN))::value; + using FragmentSmemArray = Array; + + // Do the threadblock smem reduction + using VectorGmem = cute::conditional_t; + Tensor gBuf_vec = recast(filter(gBuf_ml(_,_,m,l))); + CUTLASS_PRAGMA_UNROLL + for (int frg_idx = thread_idx; frg_idx < FragsPerRow; frg_idx += size(tiled_copy)) { + FragmentSmemArray frg_smem; + + CUTLASS_PRAGMA_UNROLL + for (int reduction_rows = 0; reduction_rows < RowNum; ++reduction_rows) { + int FragsCurrRows = reduction_rows * FragsPerRow; + frg_smem[reduction_rows] = sBuf_frg(FragsCurrRows + frg_idx); + } + + CUTLASS_PRAGMA_UNROLL + for (int reduction_rows = RowNum / 2; reduction_rows > 0; reduction_rows /= 2) { + CUTLASS_PRAGMA_UNROLL + for (int row_idx = 0; row_idx < reduction_rows; ++row_idx) { + frg_smem[row_idx] = reduce_smem(frg_smem[row_idx], frg_smem[row_idx + reduction_rows]); + } + } + gBuf_vec(frg_idx) = reinterpret_cast(frg_smem[0]); + } + sync_fn(); + } + + // + // 3. 
Increment atomic counters to signal final gmem reduction + // + if constexpr (not IsAtomic && FinalReduction) { + // Ensure gmem writes are visible to other threads before incrementing counter + __threadfence(); + sync_fn(); + // Collective thread 0 increments atomic tile counter and copies value to smem + int* prev_tile_count = reinterpret_cast(raw_pointer_cast(smem_buffer.data())); + if (thread_idx == 0) { + *prev_tile_count = atomicAdd(¶ms.tile_counters[n], 1); + } + sync_fn(); + // Broadcast tile count to other threads in CTA and determine final reduction status + do_final_reduction = *prev_tile_count == size<2>(gBuf_ml) * size<3>(gBuf_ml) - 1; + sync_fn(); + } + } + + CUTLASS_DEVICE void + end() { + // + // 4. Do final gmem reduction if necessary + // + if constexpr (not IsAtomic && FinalReduction) { + if (not do_final_reduction) { + return; + } + + auto& [ref_src, tCrRow, tCcRow, gRow_l, cRow, gBuf_ml, sBuf_layout, + lane_layout_MN, lane_mn, warp_layout_MN, warp_mn, + tile_coord_mnkl, residue_cRow, residue_tCcRow, epi_tile, tiled_copy, thread_idx] = args_tuple; + + using ReduceOutput = GmemReduceFn; + using ConvertOutput = NumericConverter; + ReduceOutput reduce_output{}; + ConvertOutput convert_output{}; + + // Reduction over batches + if (size<2>(stride(gRow_l)) == 0) { + CUTLASS_PRAGMA_NO_UNROLL + for (int n = thread_idx; n < size<1>(gBuf_ml); n += size(tiled_copy)) { + Tensor tRgBuf_ml = gBuf_ml(_0{},n,_,_); + ElementCompute output = tRgBuf_ml(_0{}); + CUTLASS_PRAGMA_NO_UNROLL + for (int ml = 1; ml < size(tRgBuf_ml); ++ml) { + output = reduce_output(output, tRgBuf_ml(ml)); + } + if (elem_less(cRow(_0{},n), residue_cRow)) { + gRow_l(_0{},n,_0{}) = convert_output(output); + } + } + } + // No reduction over batches + else { + CUTLASS_PRAGMA_NO_UNROLL + for (int n = thread_idx; n < size<1>(gBuf_ml); n += size(tiled_copy)) { + bool do_store = elem_less(cRow(_0{},n), residue_cRow); + CUTLASS_PRAGMA_NO_UNROLL + for (int l = 0; l < size<3>(gBuf_ml); ++l) { + 
          Tensor tRgBuf_m = gBuf_ml(_0{},n,_,l);
          ElementCompute output = tRgBuf_m(_0{});
          // Serially fold all CTA-tile partials along M for this (n, l).
          CUTLASS_PRAGMA_NO_UNROLL
          for (int m = 1; m < size(tRgBuf_m); ++m) {
            output = reduce_output(output, tRgBuf_m(m));
          }
          if (do_store) {
            gRow_l(_0{},n,l) = convert_output(output);
          }
        }
      }
    }

    }
  }
  };

  template <
    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
    class... Args
  >
  CUTLASS_DEVICE auto
  // NOTE(review): several template-argument lists in this method were lost in
  // extraction (e.g. ConsumerStoreArgs, Int, Layout/Shape, sm90_partition_for_epilogue,
  // bool_constant) -- restore them from the upstream source before building.
  get_consumer_store_callbacks(ConsumerStoreArgs const& args) {
    // Map each epilogue tile coordinate back to the tiled-copy (thread, value) index.
    Layout ref_layout_MN = [&] () {
      auto mn_shape = shape(typename decltype(args.tiled_copy)::Tiler_MN{});
      if constexpr (ReferenceSrc) { return right_inverse(args.tiled_copy.get_layoutS_TV()).with_shape(mn_shape); }
      else { return right_inverse(args.tiled_copy.get_layoutD_TV()).with_shape(mn_shape); }
    }(); // tile_mn -> tv_idx

    // Get the MN layout + coord of lanes to determine shuffle reduction iterations
    using _W = Int; // NOTE(review): argument list stripped; presumably the warp count -- confirm upstream
    Layout tv2lane = Layout,_W,_1>,Stride<_1,_0,_0>>{}; // tv_idx -> lane_idx
    Layout ref2lane = composition(tv2lane, ref_layout_MN); // tile_mn -> lane_idx
    Layout lane_layout_MN = make_layout(filter(get<0>(ref2lane)), filter(get<1>(ref2lane))); // lane_mn -> lane_idx
    Layout inv_lane_layout_MN = right_inverse(lane_layout_MN); // lane_idx -> lane_mn
    int lane_idx = canonical_lane_idx();
    auto lane_mn = idx2crd(inv_lane_layout_MN(lane_idx), shape(lane_layout_MN));

    // Get the MN layout + coord of warps to determine smem reduction iterations
    Layout tv2warp = Layout,_W,_1>,Stride<_0,_1,_0>>{}; // tv_idx -> warp_idx
    Layout ref2warp = composition(tv2warp, ref_layout_MN); // tile_mn -> warp_idx
    Layout warp_layout_MN = make_layout(filter(get<0>(ref2warp)), filter(get<1>(ref2warp))); // warp_mn -> warp_idx
    Layout inv_warp_layout_MN = right_inverse(warp_layout_MN); // warp_idx -> warp_mn

    int warp_idx = args.thread_idx / NumThreadsPerWarp;
    auto warp_mn = idx2crd(inv_warp_layout_MN(warp_idx), shape(warp_layout_MN));

    // Partition output gmem and register tensors
    auto [tile_M, tile_N, tile_K] = args.tile_shape_mnk;
    auto [M, N, K, L] = args.problem_shape_mnkl;
    auto [m, n, k, l] = args.tile_coord_mnkl;

    Tensor mRow = make_tensor(make_gmem_ptr(params.ptr_row), make_shape(M,N,L), params.dRow); // (M,N,L)
    Tensor gRow_l = local_tile(mRow, take<0,2>(args.tile_shape_mnk), make_coord(m,n,_)); // (CTA_M,CTA_N,L)
    Tensor tCgRow = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
                      gRow_l(_,_,l), args.epi_tile, args.tiled_copy, args.thread_idx);
    Tensor tCrRow = make_tensor_like(tCgRow); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)

    // Start all register accumulators at the reduction identity.
    fill(tCrRow, params.reduction_identity);

    // Partition gmem+smem reduction buffer tensors
    Layout gBuf_layout = make_layout(take<0,2>(args.tile_shape_mnk), make_stride(_0{}, _1{}));
    auto block_shape = ceil_div(make_shape(M,N,L), shape(gBuf_layout)); // (M_CNT, N_CNT, L_CNT)

    // Let the M_CNT (the num of partial reduction results) become the outer mode
    Layout block_layout = make_layout(block_shape, make_stride(get<1>(block_shape), _1{}, get<0>(block_shape) * get<1>(block_shape)));
    Layout mBuf_layout = blocked_product(gBuf_layout, block_layout);
    Tensor mBuf = make_tensor(make_gmem_ptr(params.reduction_buffer), mBuf_layout); // (ceil_M,ceil_N,L)
    Tensor gBuf_ml = local_tile(mBuf, take<0,2>(args.tile_shape_mnk), make_coord(_,n,_)); // (CTA_M,CTA_N,REST_M,L)
    Layout sBuf_layout = blocked_product(gBuf_layout, // (CTA_M,CTA_N,WARPS_M)
                           make_layout(make_shape(_1{},_1{},size<0>(warp_layout_MN))));

    // Bundle everything the device callbacks need into one tuple.
    auto args_tuple = make_tuple(
      bool_constant{}, cute::move(tCrRow), args.tCcD, gRow_l, args.cD, gBuf_ml, sBuf_layout,
      lane_layout_MN, lane_mn, warp_layout_MN, warp_mn,
      args.tile_coord_mnkl, args.residue_cD, args.residue_tCcD, args.epi_tile, args.tiled_copy, args.thread_idx);
    return ConsumerStoreCallbacks(cute::move(args_tuple), params);
  }
};

+///////////////////////////////////////////////////////////////////////////////////////////////// + +// Col vector reduction +template < + template class RegReduceFn, + template class ShuffleReduceFn, + template class GmemReduceFn, + int Stages, + class CtaTileShapeMNK, + class ElementOutput, + class ElementCompute, + FloatRoundStyle RoundStyle, + class StrideMNL = Stride<_1,_0,_0>, + int Alignment = 128 / sizeof_bits_v, + bool EnableNullptr = true, // Noop on nullptr params + // If this is false, ptr_col is assumed to point to a compact m-major (round_nearest(M,CTA_M), ceil_div(N,CTA_N), L) + // tensor of ElementCompute. It is the user's responsibility to reduce this to a (M, L) tensor of ElementOutput + bool FinalReduction = true, + // False means skip OOB predication if OOB inputs are known to be the reduction identity + bool VisitCheckOOB = true +> +struct Sm90ColReduction { +private: + static_assert(Stages == 0, "Smem usage not supported yet"); + static_assert(Alignment * sizeof_bits_v % 128 == 0, "sub-16B alignment not supported yet"); + static_assert(is_static_v(StrideMNL{}))>); // batch stride can be dynamic or static + static_assert(take<0,2>(StrideMNL{}) == Stride<_1,_0>{}); + static constexpr bool IsAtomic = is_atomic>::value; + static_assert(not (IsAtomic && not FinalReduction), "atomic reduction must be final"); + +public: + struct SharedStorage { }; + + struct Arguments { + void* ptr_col = nullptr; // ElementOutput* if FinalReduction, else ElementCompute* + ElementCompute reduction_identity = ElementCompute(0); + StrideMNL dCol = {}; + }; + + struct Params { + void* ptr_col = nullptr; + ElementCompute reduction_identity = ElementCompute(0); + StrideMNL dCol = {}; + ElementCompute* reduction_buffer = nullptr; + int* tile_counters = nullptr; + }; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + ElementCompute* reduction_buffer; + int* tile_counters = nullptr; 
+ if constexpr (IsAtomic) { + reduction_buffer = nullptr; + } + else if constexpr (FinalReduction) { + auto problem_shape_mnkl = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_mnkl; + auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{}; + size_t tile_counters_offset = product(ceil_div(make_shape(M,N,L), make_shape(tile_M, tile_N))) * tile_M * sizeof(ElementCompute); + tile_counters_offset = round_nearest(tile_counters_offset, MinWorkspaceAlignment); + + reduction_buffer = reinterpret_cast(workspace); + tile_counters = reinterpret_cast(reinterpret_cast(workspace) + tile_counters_offset); + } + else { + reduction_buffer = reinterpret_cast(args.ptr_col); + } + + return { + args.ptr_col, + args.reduction_identity, + args.dCol, + reduction_buffer, + tile_counters + }; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return true; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + if constexpr (IsAtomic || not FinalReduction) { + return 0; + } + + size_t workspace_size = 0; + auto problem_shape_mnkl = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_mnkl; + auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{}; + + // Increment by size of reduction buffer + workspace_size += product(ceil_div(make_shape(M,N,L), make_shape(tile_M, tile_N))) * tile_M * sizeof(ElementCompute); + // Align and increment by size of tile counters + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + workspace_size += cute::ceil_div(M, tile_M) * sizeof(int); + + return workspace_size; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + if constexpr (IsAtomic) { + auto problem_shape_mnkl = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_mnkl; + Layout 
mCol_layout = make_layout(make_shape(size<>(M),size<>(N),size<>(L)), args.dCol); + if (args.ptr_col != nullptr) { + return fill_workspace(args.ptr_col, ElementOutput(args.reduction_identity), cosize(mCol_layout), stream, cuda_adapter); + } + return Status::kSuccess; + } + else if constexpr (FinalReduction) { + auto problem_shape_mnkl = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_mnkl; + auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{}; + size_t tile_counters_offset = product(ceil_div(make_shape(M,N,L), make_shape(tile_M, tile_N))) * tile_M * sizeof(ElementCompute); + tile_counters_offset = round_nearest(tile_counters_offset, MinWorkspaceAlignment); + + int* tile_counters = reinterpret_cast(reinterpret_cast(workspace) + tile_counters_offset); + size_t tile_counters_size = cute::ceil_div(M, tile_M) * sizeof(int); + return zero_workspace(tile_counters, tile_counters_size, stream, cuda_adapter); + } + else { + return Status::kSuccess; + } + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_HOST_DEVICE + Sm90ColReduction() { } + + CUTLASS_HOST_DEVICE + Sm90ColReduction(Params const& params, SharedStorage const& shared_storage) + : params(params) { } + + Params params; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(ArgsTuple&& args_tuple, Params const& params) + : args_tuple(cute::forward(args_tuple)), + params(params) {} + + ArgsTuple args_tuple; + Params const& params; + bool do_final_reduction = false; + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n, + Array const& frg_input) { + if constexpr (EnableNullptr) { + if (params.ptr_col == nullptr) { + return frg_input; + } + } + + 
auto& [ref_src, tCrCol, tCcCol, gCol_l, cCol, gBuf_nl, sBuf_layout, + lane_layout_MN, lane_mn, warp_layout_MN, warp_mn, + tile_coord_mnkl, residue_cCol, residue_tCcCol, epi_tile, tiled_copy, thread_idx] = args_tuple; + Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n); + Tensor tCcCol_mn = tCcCol(_,_,_,epi_m,epi_n); + + using ConvertInput = NumericArrayConverter; + using ReduceInput = RegReduceFn; + ConvertInput convert_input{}; + ReduceInput reduce_input{}; + + Array frg_I = convert_input(frg_input); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + if (!VisitCheckOOB || elem_less(tCcCol_mn(epi_v * FragmentSize + i), residue_tCcCol)) { + ElementCompute& tCrCol_vmn = tCrCol_mn(epi_v * FragmentSize + i); + tCrCol_vmn = reduce_input(tCrCol_vmn, frg_I[i]); + } + } + + return frg_input; + } + + template + CUTLASS_DEVICE void + reduce(STensor&& smem_buffer, SyncFn const& sync_fn, int epi_m, int epi_n, bool is_last_iteration, VTensor visit_results) { + if (not is_last_iteration) { + return; + } + + auto& [ref_src, tCrCol, tCcCol, gCol_l, cCol, gBuf_nl, sBuf_layout, + lane_layout_MN, lane_mn, warp_layout_MN, warp_mn, + tile_coord_mnkl, residue_cCol, residue_tCcCol, epi_tile, tiled_copy, thread_idx] = args_tuple; + auto [m, n, k, l] = tile_coord_mnkl; + constexpr bool ReferenceSrc = decltype(ref_src)::value; + + // Runtime nullptr is noop + if constexpr (EnableNullptr) { + if (params.ptr_col == nullptr) { + return; + } + } + + // fully OOB CTA in partially OOB cluster + if (not elem_less(cCol(_0{},_0{}), residue_cCol)) { + return; + } + + // + // 1. 
Warp shuffle reduction + // + using FragmentShuffle = Array; + using ReduceShuffle = ShuffleReduceFn; + ReduceShuffle reduce_shuffle{}; + Tensor tCrCol_frg = recast(filter(tCrCol)); + CUTLASS_PRAGMA_UNROLL + for (int reduction_cols = size<1>(lane_layout_MN) / 2; reduction_cols > 0; reduction_cols /= 2) { + CUTLASS_PRAGMA_UNROLL + for (int frg_idx = 0; frg_idx < size(tCrCol_frg); ++frg_idx) { + uint64_t frg_shfl = reinterpret_cast(tCrCol_frg(frg_idx)); + frg_shfl = __shfl_down_sync(0xFFFFFFFF, frg_shfl, lane_layout_MN(_0{},reduction_cols)); + tCrCol_frg(frg_idx) = reduce_shuffle(tCrCol_frg(frg_idx), reinterpret_cast(frg_shfl)); + } + } + bool is_reduced_lane = get<1>(lane_mn) == 0; + + // + // 2. Atomic reduction + // + if constexpr (IsAtomic) { + // Filter so we don't issue redunant copies over stride-0 modes + Tensor tCrCol_flt = filter_zeros(tCrCol); + Tensor tCcCol_flt = make_tensor(tCcCol.data(), make_layout(tCrCol_flt.shape(), tCcCol.stride())); + + Tensor tCgCol = sm90_partition_for_epilogue(gCol_l(_,_,l), epi_tile, tiled_copy, thread_idx); + Tensor tCgCol_flt = filter_zeros(tCgCol); + + // NOTE: atomic reduction is performed in the output type + using ConvertOutput = NumericConverter; + using ReduceOutput = GmemReduceFn; + ConvertOutput convert_output{}; + ReduceOutput reduce_output{}; + + if (is_reduced_lane) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tCrCol_flt); ++i) { + if (elem_less(tCcCol_flt(i), residue_tCcCol)) { + reduce_output(&tCgCol_flt(i), convert_output(tCrCol_flt(i))); + } + } + } + sync_fn(); + } + + // + // 2. One warp in N, skip threadblock smem reduction + // + else if constexpr (decltype(size<1>(warp_layout_MN))::value <= 1) { + // Dump warp reduction to gmem workspace + using ElementGmem = cute::conditional_t; + Tensor tCgBuf = sm90_partition_for_epilogue(gBuf_nl(_,_,n,l), epi_tile, tiled_copy, thread_idx); + if (is_reduced_lane) { + copy_aligned(tCrCol, recast(tCgBuf)); + } + sync_fn(); + } + + // + // 2. 
Multiple warps in N, do threadblock smem reduction + // + else { + Tensor sBuf = make_tensor(make_smem_ptr(raw_pointer_cast(smem_buffer.data())), sBuf_layout); + static_assert(decltype(cosize(sBuf.layout()))::value * sizeof(ElementCompute) <= + decltype(cosize(smem_buffer.layout()))::value * sizeof(typename remove_cvref_t::value_type), + "smem reduction buffer not large enough, use a larger epilogue tile"); + sync_fn(); + + // Dump warp reduction to smem workspace + Tensor tCsBuf = sm90_partition_for_epilogue(sBuf(_,_,get<1>(warp_mn)), epi_tile, tiled_copy, thread_idx); + if (is_reduced_lane) { + copy_aligned(tCrCol, tCsBuf); + } + sync_fn(); + + constexpr int SmemFragSize = cute::max(size_t{1}, sizeof(uint32_t) / sizeof(ElementCompute)); + using FragmentSmem = Array; + using VectorSmem = uint_bit_t>; + using ReduceSmem = GmemReduceFn; + ReduceSmem reduce_smem{}; + + Tensor sBuf_frg = recast(filter_zeros(sBuf)); + Tensor sBuf_vec = recast(filter_zeros(sBuf)); + constexpr int FragsPerCol = decltype(size<0>(sBuf_frg))::value; + + // Do the threadblock smem reduction + CUTLASS_PRAGMA_UNROLL + for (int reduction_cols = size<1>(warp_layout_MN) / 2; reduction_cols > 1; reduction_cols /= 2) { + int FragsPerReduction = reduction_cols * FragsPerCol; + CUTLASS_PRAGMA_NO_UNROLL + for (int frg_idx = thread_idx; frg_idx < FragsPerReduction; frg_idx += size(tiled_copy)) { + FragmentSmem frg_smem = reduce_smem(sBuf_frg(frg_idx), sBuf_frg(frg_idx + FragsPerReduction)); + sBuf_vec(frg_idx) = reinterpret_cast(frg_smem); + } + sync_fn(); + } + + // Do final smem reduction and dump to gmem workspace + using VectorGmem = cute::conditional_t; + Tensor gBuf_vec = recast(filter(gBuf_nl(_,_,n,l))); + CUTLASS_PRAGMA_NO_UNROLL + for (int frg_idx = thread_idx; frg_idx < FragsPerCol; frg_idx += size(tiled_copy)) { + FragmentSmem frg_smem = reduce_smem(sBuf_frg(frg_idx), sBuf_frg(frg_idx + FragsPerCol)); + gBuf_vec(frg_idx) = reinterpret_cast(frg_smem); + } + sync_fn(); + } + + // + // 3. 
Increment atomic counters to signal final gmem reduction + // + if constexpr (not IsAtomic && FinalReduction) { + // Ensure gmem writes are visible to other threads before incrementing counter + __threadfence(); + sync_fn(); + // Collective thread 0 increments atomic tile counter and copies value to smem + int* prev_tile_count = reinterpret_cast(raw_pointer_cast(smem_buffer.data())); + if (thread_idx == 0) { + *prev_tile_count = atomicAdd(¶ms.tile_counters[m], 1); + } + sync_fn(); + // Broadcast tile count to other threads in CTA and determine final reduction status + do_final_reduction = *prev_tile_count == size<2>(gBuf_nl) * size<3>(gBuf_nl) - 1; + sync_fn(); + } + } + + CUTLASS_DEVICE void + end() { + // + // 4. Do final gmem reduction if necessary + // + if constexpr (not IsAtomic && FinalReduction) { + if (not do_final_reduction) { + return; + } + + auto& [ref_src, tCrCol, tCcCol, gCol_l, cCol, gBuf_nl, sBuf_layout, + lane_layout_MN, lane_mn, warp_layout_MN, warp_mn, + tile_coord_mnkl, residue_cCol, residue_tCcCol, epi_tile, tiled_copy, thread_idx] = args_tuple; + + using ReduceOutput = GmemReduceFn; + using ConvertOutput = NumericConverter; + ReduceOutput reduce_output{}; + ConvertOutput convert_output{}; + + // Reduction over batches + if (size<2>(stride(gCol_l)) == 0) { + CUTLASS_PRAGMA_NO_UNROLL + for (int m = thread_idx; m < size<0>(gBuf_nl); m += size(tiled_copy)) { + Tensor tRgBuf_nl = gBuf_nl(m,_0{},_,_); + ElementCompute output = tRgBuf_nl(_0{}); + CUTLASS_PRAGMA_NO_UNROLL + for (int nl = 1; nl < size(tRgBuf_nl); ++nl) { + output = reduce_output(output, tRgBuf_nl(nl)); + } + if (elem_less(cCol(m,_0{}), residue_cCol)) { + gCol_l(m,_0{},_0{}) = convert_output(output); + } + } + } + // No reduction over batches + else { + CUTLASS_PRAGMA_NO_UNROLL + for (int m = thread_idx; m < size<0>(gBuf_nl); m += size(tiled_copy)) { + bool do_store = elem_less(cCol(m,_0{}), residue_cCol); + CUTLASS_PRAGMA_NO_UNROLL + for (int l = 0; l < size<3>(gBuf_nl); ++l) { + 
Tensor tRgBuf_n = gBuf_nl(m,_0{},_,l); + ElementCompute output = tRgBuf_n(_0{}); + CUTLASS_PRAGMA_NO_UNROLL + for (int n = 1; n < size(tRgBuf_n); ++n) { + output = reduce_output(output, tRgBuf_n(n)); + } + if (do_store) { + gCol_l(m,_0{},l) = convert_output(output); + } + } + } + } + + } + } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + Layout ref_layout_MN = [&] () { + auto mn_shape = shape(typename decltype(args.tiled_copy)::Tiler_MN{}); + if constexpr (ReferenceSrc) { return right_inverse(args.tiled_copy.get_layoutS_TV()).with_shape(mn_shape); } + else { return right_inverse(args.tiled_copy.get_layoutD_TV()).with_shape(mn_shape); } + }(); // tile_mn -> tv_idx + + // Get the MN layout + coord of lanes to determine shuffle reduction iterations + using _W = Int; + Layout tv2lane = Layout,_W,_1>,Stride<_1,_0,_0>>{}; // tv_idx -> lane_idx + Layout ref2lane = composition(tv2lane, ref_layout_MN); // tile_mn -> lane_idx + Layout lane_layout_MN = make_layout(filter(get<0>(ref2lane)), filter(get<1>(ref2lane))); // lane_mn -> lane_idx + Layout inv_lane_layout_MN = right_inverse(lane_layout_MN); // lane_idx -> lane_mn + int lane_idx = canonical_lane_idx(); + auto lane_mn = idx2crd(inv_lane_layout_MN(lane_idx), shape(lane_layout_MN)); + + // Get the MN layout + coord of warps to determine smem reduction iterations + Layout tv2warp = Layout,_W,_1>,Stride<_0,_1,_0>>{}; // tv_idx -> warp_idx + Layout ref2warp = composition(tv2warp, ref_layout_MN); // tile_mn -> warp_idx + Layout warp_layout_MN = make_layout(filter(get<0>(ref2warp)), filter(get<1>(ref2warp))); // warp_mn -> warp_idx + Layout inv_warp_layout_MN = right_inverse(warp_layout_MN); // warp_idx -> warp_mn + int warp_idx = args.thread_idx / NumThreadsPerWarp; + auto warp_mn = idx2crd(inv_warp_layout_MN(warp_idx), shape(warp_layout_MN)); + + // 
Partition output gmem and register tensors + auto [tile_M, tile_N, tile_K] = args.tile_shape_mnk; + auto [M, N, K, L] = args.problem_shape_mnkl; + auto [m, n, k, l] = args.tile_coord_mnkl; + + Tensor mCol = make_tensor(make_gmem_ptr(params.ptr_col), make_shape(M,N,L), params.dCol); // (M,N,L) + Tensor gCol_l = local_tile(mCol, take<0,2>(args.tile_shape_mnk), make_coord(m,n,_)); // (CTA_M,CTA_N,L) + Tensor tCgCol = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + gCol_l(_,_,l), args.epi_tile, args.tiled_copy, args.thread_idx); + Tensor tCrCol = make_tensor_like(tCgCol); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + fill(tCrCol, params.reduction_identity); + + // Partition gmem+smem reduction buffer tensors + Layout gBuf_layout = make_layout(take<0,2>(args.tile_shape_mnk), make_stride(_1{}, _0{})); + Layout mBuf_layout = blocked_product(gBuf_layout, make_layout(ceil_div(make_shape(M,N,L), shape(gBuf_layout)))); + Tensor mBuf = make_tensor(make_gmem_ptr(params.reduction_buffer), mBuf_layout); // (ceil_M,ceil_N,L) + Tensor gBuf_nl = local_tile(mBuf, take<0,2>(args.tile_shape_mnk), make_coord(m,_,_)); // (CTA_M,CTA_N,REST_N,L) + Layout sBuf_layout = blocked_product(gBuf_layout,make_layout(make_shape(_1{},_1{},size<1>(warp_layout_MN)))); // (CTA_M,CTA_N,WARPS_N) + + auto args_tuple = make_tuple( + bool_constant{}, cute::move(tCrCol), args.tCcD, gCol_l, args.cD, gBuf_nl, sBuf_layout, + lane_layout_MN, lane_mn, warp_layout_MN, warp_mn, + args.tile_coord_mnkl, args.residue_cD, args.residue_tCcD, args.epi_tile, args.tiled_copy, args.thread_idx); + return ConsumerStoreCallbacks(std::move(args_tuple), params); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Batch matrix reduction +template < + int Stages, + class EpilogueTile, + class Element, + class StrideMNL, + class CopyOpR2S, + class SmemLayoutAtom, + int Alignment = 128 / sizeof_bits_v, + bool EnableNullptr = true // Noop on nullptr params +> +struct 
Sm90MatrixReduction; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::fusion + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp new file mode 100644 index 0000000000000000000000000000000000000000..93720f8d3d71f3f4759463b5d40e604313b7e3a4 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp @@ -0,0 +1,1149 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Visitor tree operation base implementation to enable composable fusions + for the sm90 TMA warp-specialized (ws) epilogue +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/workspace.h" +#include "cutlass/detail/helper_macros.hpp" + +#include "cute/tensor.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::fusion { + +using namespace cute; +using cute::tuple; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Partitioning Helpers +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class CtaTileMN, + class EpilogueTile, + class TiledCopy +> +CUTLASS_HOST_DEVICE +constexpr auto +sm90_partition_for_epilogue( + 
CtaTileMN cT, // (CTA_M,CTA_N,...) + EpilogueTile epi_tile, // (EPI_TILE_M,EPI_TILE_N) + TiledCopy tiled_copy, + int thread_idx) { + ThrCopy thread_copy = tiled_copy.get_thread_slice(thread_idx); + Tensor cT_epi = flat_divide(cT, epi_tile); // (EPI_TILE_M,EPI_TILE_N,EPI_M,EPI_N,...) + if constexpr (ReferenceSrc) { + return thread_copy.partition_S(cT_epi); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,...) + } + else { + return thread_copy.partition_D(cT_epi); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,...) + } +} + +template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class Engine, class LayoutMNL, + class TileShapeMNK, + class TileCoordMNKL, + class EpilogueTile, + class TiledCopy +> +CUTLASS_HOST_DEVICE +constexpr auto +sm90_partition_for_epilogue( + Tensor mT, // (M,N,L) + TileShapeMNK tile_shape_mnk, // (CTA_M,CTA_N,CTA_K) + TileCoordMNKL tile_coord_mnkl, // (m,n,k,l) + EpilogueTile epi_tile, // (EPI_TILE_M,EPI_TILE_N) + TiledCopy tiled_copy, + int thread_idx) { + auto [m, n, k, l] = tile_coord_mnkl; + auto coord_shape = + make_coord(m, n, l) + ; + Tensor cT = local_tile(mT, take<0,2>(tile_shape_mnk), coord_shape); // (CTA_M,CTA_N) + Tensor tCcT = + sm90_partition_for_epilogue(cT, epi_tile, tiled_copy, thread_idx); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + + return tCcT; +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Visitor Implementation +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +// +// Producer load callbacks, called by the epilogue load warp. +// Operations usually only define this if TMA load is needed. 
Most operations will reuse this empy implementation +// Load callbacks are responsible for issuing corresponding mbarrier expect-tx ops for any TMA loads issued, but +// are not responsible for issuing the producer_commit barrier arrival, which is issued by the collective instead +// If this is non-empty, is_producer_load_needed must be true. +// +template +struct ProducerLoadCallbacksImpl { + // Callbacks can store non-persistent variables (e.g. tensors) or copies of persistent variables + CallbacksTuple callbacks_tuple; + + // Before entry of the subtile load loop + CUTLASS_DEVICE void + begin() { + for_each(callbacks_tuple, + [&] (auto& callbacks) CUTLASS_LAMBDA_FUNC_INLINE { + callbacks.begin(); + } + ); + } + + // Entry of the subtile load loop. Aux loads usually performed here + // Upon entry the producer acquire of the current subtile lock has completed. + // Upon exit all TMA loads for this subtile must have been issued, with corresponding expect-tx operations + CUTLASS_DEVICE void + step(uint64_t* full_mbarrier_ptr, int epi_m, int epi_n, int load_iteration, bool issue_tma_load) { + for_each(callbacks_tuple, + [&] (auto& callbacks) CUTLASS_LAMBDA_FUNC_INLINE { + callbacks.step(full_mbarrier_ptr, epi_m, epi_n, load_iteration, issue_tma_load); + } + ); + } + + // Exit of the subtile load loop. + CUTLASS_DEVICE void + end() { + for_each(callbacks_tuple, + [] (auto& callbacks) CUTLASS_LAMBDA_FUNC_INLINE { + callbacks.end(); + } + ); + } +}; + + +// +// Consumer store callbacks, called by the epilogue store warps. +// All operations must redefine this, with optional inheritance from this empty implementation. +// +template +struct ConsumerStoreCallbacksImpl { + // Callbacks can store non-persistent variables (e.g. tensors) or copies of persistent variables + CallbacksTuple callbacks_tuple; + + // Before entry of subtile store loop. Gmem broadcasts usually performed here. 
+ CUTLASS_DEVICE void + begin() { + for_each(callbacks_tuple, + [] (auto& callbacks) CUTLASS_LAMBDA_FUNC_INLINE { + callbacks.begin(); + } + ); + } + + // Is a thread sync needed after begin(). Allows chaining async copies across multiple nodes + CUTLASS_DEVICE bool + begin_sync_needed() const { + return cute::apply(callbacks_tuple, + [] (auto const&... callbacks) { + return (false || ... || callbacks.begin_sync_needed()); + } + ); + } + + // Start of subtile store iteration + CUTLASS_DEVICE void + begin_loop(int epi_m, int epi_n) { + for_each(callbacks_tuple, + [&] (auto& callbacks) CUTLASS_LAMBDA_FUNC_INLINE { + callbacks.begin_loop(epi_m, epi_n); + } + ); + } + + // Before visit callback. Smem broadcasts usually performed here. + // Upon entry, all producer loads for this subtile are completed and visible. + CUTLASS_DEVICE void + previsit(int epi_m, int epi_n, int load_iteration, bool is_producer_load_needed) { + for_each(callbacks_tuple, + [&] (auto& callbacks) CUTLASS_LAMBDA_FUNC_INLINE { + callbacks.previsit(epi_m, epi_n, load_iteration, is_producer_load_needed); + } + ); + } + + // Perform the fused elementwise computation + template + CUTLASS_DEVICE auto // returns an Array + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n, + Array const&... frg_inputs) // depends on the N-naryness of the op + = delete; // Must be implemented for each operation + + // After visit call. Smem reductions usually performed here + // reduction_buffer is an arbitrary smem tensor that can be used for workspace + // It is each nodes reponsibility to assert that this buffer is sufficiently sized + // and to ensure that this buffer is no longer needed upon callback exit + // i.e. 
results are synchronized and no longer in the reduction buffer + // + // visit_results is a rmem tensor that contains the results of visit() for an entire + // on the current epilogue subtile + template + CUTLASS_DEVICE void + reduce(STensor&& reduction_buffer, SyncFn const& sync_fn, int epi_m, int epi_n, bool is_last_iteration, VTensor visit_results) { + for_each(callbacks_tuple, + [&] (auto& callbacks) CUTLASS_LAMBDA_FUNC_INLINE { + callbacks.reduce(reduction_buffer, sync_fn, epi_m, epi_n, is_last_iteration, visit_results); + } + ); + } + + // After reduce call, before smem async fence. Smem stores usually performed here. + // Upon exit, all smem stores for TMA must have been issued + CUTLASS_DEVICE void + postreduce(int epi_m, int epi_n, int store_iteration, bool issue_smem_store) { + for_each(callbacks_tuple, + [&] (auto& callbacks) CUTLASS_LAMBDA_FUNC_INLINE { + callbacks.postreduce(epi_m, epi_n, store_iteration, issue_smem_store); + } + ); + } + + // After smem async fence, before TMA store commit. Aux stores usually performed here + // Upon exit, all TMA stores for this subtile must have been issued + // Because of the TMA store delay optimization, this entry point must ONLY be used for TMA stores + // other gmem stores can be placed in the reduce or postreduce entry points + CUTLASS_DEVICE void + tma_store(int epi_m, int epi_n, int store_iteration, bool issue_tma_store) { + for_each(callbacks_tuple, + [&] (auto& callbacks) CUTLASS_LAMBDA_FUNC_INLINE { + callbacks.tma_store(epi_m, epi_n, store_iteration, issue_tma_store); + } + ); + } + + // End of subtile store iteration + CUTLASS_DEVICE void + end_loop(int epi_m, int epi_n) { + for_each(callbacks_tuple, + [&] (auto& callbacks) CUTLASS_LAMBDA_FUNC_INLINE { + callbacks.end_loop(epi_m, epi_n); + } + ); + } + + // Exit of subtile store loop. Gmem reductions usually performed here. 
+ CUTLASS_DEVICE void + end() { + for_each(callbacks_tuple, + [&] (auto& callbacks) CUTLASS_LAMBDA_FUNC_INLINE { + callbacks.end(); + } + ); + } +}; + +template< + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL, + class TiledMma, + class EpilogueTile +> +struct ProducerLoadArgs { + ProblemShapeMNKL problem_shape_mnkl; + TileShapeMNK tile_shape_mnk; + TileCoordMNKL tile_coord_mnkl; + TiledMma tiled_mma; + EpilogueTile epi_tile; + int thread_idx; + + CUTLASS_DEVICE + ProducerLoadArgs( + ProblemShapeMNKL problem_shape_mnkl, + TileShapeMNK tile_shape_mnk, + TileCoordMNKL tile_coord_mnkl, + TiledMma tiled_mma, + EpilogueTile epi_tile, + int thread_idx) + : problem_shape_mnkl(problem_shape_mnkl), + tile_shape_mnk(tile_shape_mnk), + tile_coord_mnkl(tile_coord_mnkl), + tiled_mma(tiled_mma), + epi_tile(epi_tile), + thread_idx(thread_idx) {} +}; + +template< + class ProblemShapeMNKL, + class TileShapeMNK, + class TileCoordMNKL, + class TiledMma, + class EpilogueTile, + class TiledCopy, + class CoordTensor, + class Residue, + class ThrCoordTensor, + class ThrResidue, + class ThrSrcTensor +> +struct ConsumerStoreArgs { + ProblemShapeMNKL problem_shape_mnkl; + TileShapeMNK tile_shape_mnk; + TileCoordMNKL tile_coord_mnkl; + TiledMma tiled_mma; + EpilogueTile epi_tile; + TiledCopy tiled_copy; + CoordTensor cD; + Residue residue_cD; + ThrCoordTensor tCcD; + ThrResidue residue_tCcD; + ThrSrcTensor & tCrC; + int thread_idx; + + CUTLASS_DEVICE + ConsumerStoreArgs( + ProblemShapeMNKL problem_shape_mnkl, + TileShapeMNK tile_shape_mnk, + TileCoordMNKL tile_coord_mnkl, + TiledMma tiled_mma, + EpilogueTile epi_tile, + TiledCopy tiled_copy, + CoordTensor cD, + Residue residue_cD, + ThrCoordTensor tCcD, + ThrResidue residue_tCcD, + ThrSrcTensor & tCrC, + int thread_idx) + : problem_shape_mnkl(problem_shape_mnkl), + tile_shape_mnk(tile_shape_mnk), + tile_coord_mnkl(tile_coord_mnkl), + tiled_mma(tiled_mma), + epi_tile(epi_tile), + tiled_copy(tiled_copy), + cD(cD), + 
residue_cD(residue_cD), + tCcD(tCcD), + residue_tCcD(residue_tCcD), + tCrC(tCrC), + thread_idx(thread_idx) {} +}; + +template +struct Sm90VisitorImplBase { + // Shared memory allocation + using SharedStorage = tuple; + // Host side fusion arguments + using Arguments = tuple; + // Device side fusion params (Kernel-entry API) + using Params = tuple; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + uint8_t* op_workspace = reinterpret_cast(workspace); + return transform_apply(tuple{}, args, + [&] (auto&& op, auto const& op_args) CUTLASS_LAMBDA_FUNC_INLINE { + using Op = cute::remove_cvref_t; + auto ret = Op::to_underlying_arguments(problem_shape, op_args, op_workspace); + if (op_workspace != nullptr) { + size_t op_workspace_size = Op::get_workspace_size(problem_shape, op_args); + op_workspace += round_nearest(op_workspace_size, MinWorkspaceAlignment); + } + return ret; + }, + [] (auto&&... op_params) CUTLASS_LAMBDA_FUNC_INLINE { return cute::make_tuple(op_params...); } + ); + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return transform_apply(tuple{}, args, + [&] (auto&& op, auto const& op_args) CUTLASS_LAMBDA_FUNC_INLINE { + using Op = cute::remove_cvref_t; + return Op::can_implement(problem_shape, op_args); + }, + [&] (auto&&... implementable) CUTLASS_LAMBDA_FUNC_INLINE { + return (true && ... && implementable); + } + ); + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return transform_apply(tuple{}, args, + [&] (auto&& op, auto const& op_args) CUTLASS_LAMBDA_FUNC_INLINE { + using Op = cute::remove_cvref_t; + size_t op_workspace_size = Op::get_workspace_size(problem_shape, op_args); + return round_nearest(op_workspace_size, MinWorkspaceAlignment); + }, + [&] (auto&&... op_workspace_size) CUTLASS_LAMBDA_FUNC_INLINE { + return (0 + ... 
+ op_workspace_size); + } + ); + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + Status status = Status::kSuccess; + uint8_t* op_workspace = reinterpret_cast(workspace); + return transform_apply(tuple{}, args, + // Initialize each operation's workspace, stopping at the first error + [&] (auto&& op, auto const& op_args) CUTLASS_LAMBDA_FUNC_INLINE { + if (status != Status::kSuccess) { + return status; + } + + using Op = cute::remove_cvref_t; + status = Op::initialize_workspace(problem_shape, op_args, op_workspace, stream, cuda_adapter); + if (op_workspace != nullptr) { + size_t op_workspace_size = Op::get_workspace_size(problem_shape, op_args); + op_workspace += round_nearest(op_workspace_size, MinWorkspaceAlignment); + } + return status; + }, + // Return the final status + [&] (auto const&...ops) CUTLASS_LAMBDA_FUNC_INLINE { return status; } + ); + } + + CUTLASS_HOST_DEVICE + Sm90VisitorImplBase() {} + + CUTLASS_HOST_DEVICE + Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage) + : ops(transform_apply(tuple{}, params, shared_storage, + [] (auto&& op, auto const& op_params, auto&& op_storage) CUTLASS_LAMBDA_FUNC_INLINE { + using Op = cute::remove_cvref_t; + return Op(op_params, op_storage); + }, + [] (auto&&... ops) CUTLASS_LAMBDA_FUNC_INLINE { return cute::make_tuple(ops...); } + )) {} + + // Ops can store kernel persistent variables (e.g. 
descriptors, scalars, wave counters) + tuple ops; +}; + +template +struct Sm90VisitorImpl : Sm90VisitorImplBase { + + using Impl = Sm90VisitorImplBase; + using Params = typename Impl::Params; + using SharedStorage = typename Impl::SharedStorage; + + CUTLASS_HOST_DEVICE + Sm90VisitorImpl() {} + + CUTLASS_HOST_DEVICE + Sm90VisitorImpl(Params const& params, SharedStorage const& shared_storage) + : Impl(params, shared_storage) {} + + using Impl::ops; + + // + // Queries for kernel runtime + // + + // Is a specialized warp for producer TMA loads needed + // e.g. Aux tensor loads, broadcasts using TMA bulk copy + // This condition cannot change between work tiles because it is used + // to determine whether the load warp should exit early or not + // e.g. for batched beta this must always be true regardless of current batch idx + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return cute::apply(ops, + [] (auto const&... op) CUTLASS_LAMBDA_FUNC_INLINE { + return (false || ... || op.is_producer_load_needed()); + } + ); + } + + // Is a producer TMA load specifically for C needed + // If this is true then is_producer_load_needed must also be true + // This condition can change between work tiles because it is only used + // to determine whether the TMA and smem loads for C of a given tile should happen + // e.g. for batched beta this can be false depending on current batch idx + CUTLASS_DEVICE bool + is_C_load_needed() const { + return cute::apply(ops, + [] (auto const&... op) CUTLASS_LAMBDA_FUNC_INLINE { + return (false || ... || op.is_C_load_needed()); + } + ); + } + + // Producer load callbacks factory + // All operations must redefine this, but most can just dispatch to the base impl + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return transform_apply(ops, + [&] (auto& op) CUTLASS_LAMBDA_FUNC_INLINE { + return op.get_producer_load_callbacks(args); + }, + [] (auto&&... 
callbacks) CUTLASS_LAMBDA_FUNC_INLINE { + auto callbacks_tuple = cute::make_tuple(callbacks...); + return ProducerLoadCallbacksImpl{callbacks_tuple}; + } + ); + } + + // Consumer store callbacks factory + // All operations must redefine this + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + return transform_apply(ops, + [&] (auto& op) CUTLASS_LAMBDA_FUNC_INLINE { + return op.template get_consumer_store_callbacks(args); + }, + [] (auto&&... callbacks) CUTLASS_LAMBDA_FUNC_INLINE { + auto callbacks_tuple = cute::make_tuple(callbacks...); + return ConsumerStoreCallbacksImpl{callbacks_tuple}; + } + ); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Convenience aliases +using EmptyProducerLoadCallbacks = ProducerLoadCallbacksImpl>; +using EmptyConsumerStoreCallbacks = ConsumerStoreCallbacksImpl>; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace detail + +using namespace detail; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tree visitor +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Sm90TreeVisitor : Sm90VisitorImpl { + + using Impl = Sm90VisitorImpl; + using Params = typename Impl::Params; + using SharedStorage = typename Impl::SharedStorage; + + CUTLASS_HOST_DEVICE + Sm90TreeVisitor() {} + + CUTLASS_HOST_DEVICE + Sm90TreeVisitor( + Params const& params, + SharedStorage const& shared_storage) + : Impl(params, shared_storage) {} + + template + struct ConsumerStoreCallbacks : CallbacksImpl { + CUTLASS_DEVICE + ConsumerStoreCallbacks(CallbacksImpl&& impl) + : CallbacksImpl(cute::forward(impl)) {} + + using CallbacksImpl::callbacks_tuple; + + 
template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + constexpr int Rm1 = sizeof...(ChildOps); + return cute::detail::tapply(callbacks_tuple, + [&] (auto& child_callbacks) CUTLASS_LAMBDA_FUNC_INLINE { + return child_callbacks.visit(frg_acc, epi_v, epi_m, epi_n); // child ops must be nullary (e.g. loads, trees) + }, + [&] (auto&&... frg_inputs) CUTLASS_LAMBDA_FUNC_INLINE { + return get(callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n, frg_inputs...); + }, + make_seq{} // restrict the transform to R-1 child ops, apply is for node op + ); + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + auto callbacks_impl = Sm90VisitorImpl:: + template get_consumer_store_callbacks(args); + return ConsumerStoreCallbacks(cute::move(callbacks_impl)); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// DAG visitors +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Most DAG fusions can be represented as a set of output trees with a common input tree +// The common input is first evaluated, then the result is passed as the acc fragment to the output trees +template +struct Sm90SplitTreeVisitor : Sm90VisitorImpl { + + using Sm90VisitorImpl::Sm90VisitorImpl; + + template + struct ConsumerStoreCallbacks : CallbacksImpl { + CUTLASS_DEVICE + ConsumerStoreCallbacks(CallbacksImpl&& impl) + : CallbacksImpl(cute::forward(impl)) {} + + using CallbacksImpl::callbacks_tuple; + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + Array frg_input = get<0>(callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n); + + constexpr int Rm2 = sizeof...(AuxOutTrees); + cute::for_each(make_seq{}, // restrict the sequence to aux 
out trees + [&] (auto I) CUTLASS_LAMBDA_FUNC_INLINE { + get(callbacks_tuple).visit(frg_input, epi_v, epi_m, epi_n); + } + ); + + return get(callbacks_tuple).visit(frg_input, epi_v, epi_m, epi_n); + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + auto callbacks_impl = Sm90VisitorImpl:: + template get_consumer_store_callbacks(args); + return ConsumerStoreCallbacks(cute::move(callbacks_impl)); + } +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + +template< + // deducing the output type for all the nodes is tricky so we just convert them all to a common type + // if multiple compute types are needed then split into multiple subgraphs grouped by type + class ElementCompute, + class EdgeTuple, // tuple of int_sequence, each sequence is the children indices (indexed by topological order) for each node + class... Ops // in topological order, last op is the output. 
EdgeTuple must match this order +> +struct Sm90TopologicalVisitor : Sm90VisitorImpl { + static_assert(is_static_v); + static_assert(cute::rank(EdgeTuple{}) == sizeof...(Ops)); + static_assert(sizeof...(Ops) > 1); + + using Sm90VisitorImpl::Sm90VisitorImpl; + + template + struct ConsumerStoreCallbacks : CallbacksImpl { + CUTLASS_DEVICE + ConsumerStoreCallbacks(CallbacksImpl&& impl) + : CallbacksImpl(cute::forward(impl)) {} + + using CallbacksImpl::callbacks_tuple; + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n) { + constexpr int Rm1 = sizeof...(Ops) - 1; + auto frg_compute_tuple = cute::repeat(Array{}); + + return cute::detail::tapply(EdgeTuple{}, callbacks_tuple, frg_compute_tuple, + // Visit the first R-1 ops in topological order + [&] (auto&& edge_seq, auto& callbacks, auto& frg_compute) CUTLASS_LAMBDA_FUNC_INLINE { + frg_compute = cute::detail::apply(frg_compute_tuple, + // Compute the current op with children inputs + [&] (auto const&... frg_inputs) CUTLASS_LAMBDA_FUNC_INLINE { + auto frg_output = callbacks.visit(frg_acc, epi_v, epi_m, epi_n, frg_inputs...); + using ElementOutput = typename decltype(frg_output)::Element; + using ConvertOutput = NumericArrayConverter; + ConvertOutput convert_output{}; + + return convert_output(frg_output); + }, + // Get inputs in the sequence given by the children indices of the current op + edge_seq + ); + return frg_compute; // unused + }, + // Visit the last op + [&] (auto const&...ops) CUTLASS_LAMBDA_FUNC_INLINE { + return cute::detail::apply(frg_compute_tuple, + // Compute the last op with children inputs + [&] (auto const&... 
frg_inputs) CUTLASS_LAMBDA_FUNC_INLINE { + return get(callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n, frg_inputs...); + }, + // Get inputs in the sequence given by the children indices of the last op + get(EdgeTuple{}) + ); + }, + // Transform to visit R-1 ops, apply to visit last op + make_seq{} + ); + } + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + auto callbacks_impl = Sm90VisitorImpl:: + template get_consumer_store_callbacks(args); + return ConsumerStoreCallbacks(cute::move(callbacks_impl)); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Base specializations so we can have standard layout params and simple aggregate initializers +namespace detail { + +template +struct Sm90VisitorImplBase { + + // Retain tuple for SharedStorage because empty structs have 1B alignment + // tuples use multiple inheritance, avoids this problem + using SharedStorage = tuple< + typename Op0::SharedStorage + >; + + struct Arguments { + typename Op0::Arguments op_0; + }; + + struct Params { + typename Op0::Params op_0; + }; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return Params{ + Op0::to_underlying_arguments(problem_shape, args.op_0, workspace) + }; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return Op0::can_implement(problem_shape, args.op_0); + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + size_t workspace_size = 0; + workspace_size += Op0::get_workspace_size(problem_shape, args.op_0); + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + + return workspace_size; + } + + template + 
static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + Status status = Status::kSuccess; + uint8_t* workspace_ptr = reinterpret_cast(workspace); + size_t workspace_offset = 0; + + status = Op0::initialize_workspace(problem_shape, args.op_0, workspace_ptr + workspace_offset, stream, cuda_adapter); + workspace_offset += Op0::get_workspace_size(problem_shape, args.op_0); + workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + if (status != Status::kSuccess) { + return status; + } + + return status; + } + + CUTLASS_HOST_DEVICE + Sm90VisitorImplBase() {} + + CUTLASS_HOST_DEVICE + Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage) + : ops({ + Op0(params.op_0, get<0>(shared_storage)) + }) {} + + tuple ops; +}; + +template +struct Sm90VisitorImplBase { + + using SharedStorage = tuple< + typename Op0::SharedStorage, + typename Op1::SharedStorage + >; + + struct Arguments { + typename Op0::Arguments op_0; + typename Op1::Arguments op_1; + }; + + struct Params { + typename Op0::Params op_0; + typename Op1::Params op_1; + }; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + size_t op_0_workspace_size = Op0::get_workspace_size(problem_shape, args.op_0); + uint8_t* op_0_workspace = reinterpret_cast(workspace); + uint8_t* op_1_workspace = op_0_workspace + op_0_workspace_size; + return Params{ + Op0::to_underlying_arguments(problem_shape, args.op_0, op_0_workspace), + Op1::to_underlying_arguments(problem_shape, args.op_1, op_1_workspace) + }; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return Op0::can_implement(problem_shape, args.op_0) && + Op1::can_implement(problem_shape, args.op_1); + } + + template + static size_t + 
get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + size_t workspace_size = 0; + workspace_size += Op0::get_workspace_size(problem_shape, args.op_0); + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + + workspace_size += Op1::get_workspace_size(problem_shape, args.op_1); + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + + return workspace_size; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + Status status = Status::kSuccess; + uint8_t* workspace_ptr = reinterpret_cast(workspace); + size_t workspace_offset = 0; + + status = Op0::initialize_workspace(problem_shape, args.op_0, workspace_ptr + workspace_offset, stream, cuda_adapter); + workspace_offset += Op0::get_workspace_size(problem_shape, args.op_0); + workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + if (status != Status::kSuccess) { + return status; + } + + status = Op1::initialize_workspace(problem_shape, args.op_1, workspace_ptr + workspace_offset, stream, cuda_adapter); + workspace_offset += Op1::get_workspace_size(problem_shape, args.op_1); + workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + if (status != Status::kSuccess) { + return status; + } + + return status; + } + + CUTLASS_HOST_DEVICE + Sm90VisitorImplBase() {} + + CUTLASS_HOST_DEVICE + Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage) + : ops({ + Op0(params.op_0, get<0>(shared_storage)), + Op1(params.op_1, get<1>(shared_storage)) + }) {} + + tuple ops; +}; + +template +struct Sm90VisitorImplBase { + + using SharedStorage = tuple< + typename Op0::SharedStorage, + typename Op1::SharedStorage, + typename Op2::SharedStorage + >; + + struct Arguments { + typename Op0::Arguments op_0; + typename Op1::Arguments op_1; + typename Op2::Arguments 
op_2; + }; + + struct Params { + typename Op0::Params op_0; + typename Op1::Params op_1; + typename Op2::Params op_2; + }; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + size_t op_0_workspace_size = Op0::get_workspace_size(problem_shape, args.op_0); + size_t op_1_workspace_size = Op1::get_workspace_size(problem_shape, args.op_1); + uint8_t* op_0_workspace = reinterpret_cast(workspace); + uint8_t* op_1_workspace = op_0_workspace + op_0_workspace_size; + uint8_t* op_2_workspace = op_1_workspace + op_1_workspace_size; + return Params{ + Op0::to_underlying_arguments(problem_shape, args.op_0, op_0_workspace), + Op1::to_underlying_arguments(problem_shape, args.op_1, op_1_workspace), + Op2::to_underlying_arguments(problem_shape, args.op_2, op_2_workspace) + }; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return Op0::can_implement(problem_shape, args.op_0) && + Op1::can_implement(problem_shape, args.op_1) && + Op2::can_implement(problem_shape, args.op_2); + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + size_t workspace_size = 0; + workspace_size += Op0::get_workspace_size(problem_shape, args.op_0); + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + + workspace_size += Op1::get_workspace_size(problem_shape, args.op_1); + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + + workspace_size += Op2::get_workspace_size(problem_shape, args.op_2); + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + + return workspace_size; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + Status status = Status::kSuccess; + uint8_t* workspace_ptr = 
reinterpret_cast(workspace); + size_t workspace_offset = 0; + + status = Op0::initialize_workspace(problem_shape, args.op_0, workspace_ptr + workspace_offset, stream, cuda_adapter); + workspace_offset += Op0::get_workspace_size(problem_shape, args.op_0); + workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + if (status != Status::kSuccess) { + return status; + } + + status = Op1::initialize_workspace(problem_shape, args.op_1, workspace_ptr + workspace_offset, stream, cuda_adapter); + workspace_offset += Op1::get_workspace_size(problem_shape, args.op_1); + workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + if (status != Status::kSuccess) { + return status; + } + + status = Op2::initialize_workspace(problem_shape, args.op_2, workspace_ptr + workspace_offset, stream, cuda_adapter); + workspace_offset += Op2::get_workspace_size(problem_shape, args.op_2); + workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + if (status != Status::kSuccess) { + return status; + } + + return status; + } + + CUTLASS_HOST_DEVICE + Sm90VisitorImplBase() {} + + CUTLASS_HOST_DEVICE + Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage) + : ops({ + Op0(params.op_0, get<0>(shared_storage)), + Op1(params.op_1, get<1>(shared_storage)), + Op2(params.op_2, get<2>(shared_storage)) + }) {} + + tuple ops; +}; + +template +struct Sm90VisitorImplBase { + + using SharedStorage = tuple< + typename Op0::SharedStorage, + typename Op1::SharedStorage, + typename Op2::SharedStorage, + typename Op3::SharedStorage + >; + + struct Arguments { + typename Op0::Arguments op_0; + typename Op1::Arguments op_1; + typename Op2::Arguments op_2; + typename Op3::Arguments op_3; + }; + + struct Params { + typename Op0::Params op_0; + typename Op1::Params op_1; + typename Op2::Params op_2; + typename Op3::Params op_3; + }; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, 
Arguments const& args, void* workspace) { + size_t op_0_workspace_size = Op0::get_workspace_size(problem_shape, args.op_0); + size_t op_1_workspace_size = Op1::get_workspace_size(problem_shape, args.op_1); + size_t op_2_workspace_size = Op2::get_workspace_size(problem_shape, args.op_2); + uint8_t* op_0_workspace = reinterpret_cast(workspace); + uint8_t* op_1_workspace = op_0_workspace + op_0_workspace_size; + uint8_t* op_2_workspace = op_1_workspace + op_1_workspace_size; + uint8_t* op_3_workspace = op_2_workspace + op_2_workspace_size; + return Params{ + Op0::to_underlying_arguments(problem_shape, args.op_0, op_0_workspace), + Op1::to_underlying_arguments(problem_shape, args.op_1, op_1_workspace), + Op2::to_underlying_arguments(problem_shape, args.op_2, op_2_workspace), + Op3::to_underlying_arguments(problem_shape, args.op_3, op_3_workspace) + }; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + return Op0::can_implement(problem_shape, args.op_0) && + Op1::can_implement(problem_shape, args.op_1) && + Op2::can_implement(problem_shape, args.op_2) && + Op3::can_implement(problem_shape, args.op_3); + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + size_t workspace_size = 0; + workspace_size += Op0::get_workspace_size(problem_shape, args.op_0); + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + + workspace_size += Op1::get_workspace_size(problem_shape, args.op_1); + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + + workspace_size += Op2::get_workspace_size(problem_shape, args.op_2); + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + + workspace_size += Op3::get_workspace_size(problem_shape, args.op_3); + workspace_size = round_nearest(workspace_size, MinWorkspaceAlignment); + + return workspace_size; + } + + template + static cutlass::Status + 
initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + Status status = Status::kSuccess; + uint8_t* workspace_ptr = reinterpret_cast(workspace); + size_t workspace_offset = 0; + + status = Op0::initialize_workspace(problem_shape, args.op_0, workspace_ptr + workspace_offset, stream, cuda_adapter); + workspace_offset += Op0::get_workspace_size(problem_shape, args.op_0); + workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + if (status != Status::kSuccess) { + return status; + } + + status = Op1::initialize_workspace(problem_shape, args.op_1, workspace_ptr + workspace_offset, stream, cuda_adapter); + workspace_offset += Op1::get_workspace_size(problem_shape, args.op_1); + workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + if (status != Status::kSuccess) { + return status; + } + + status = Op2::initialize_workspace(problem_shape, args.op_2, workspace_ptr + workspace_offset, stream, cuda_adapter); + workspace_offset += Op2::get_workspace_size(problem_shape, args.op_2); + workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + if (status != Status::kSuccess) { + return status; + } + + status = Op3::initialize_workspace(problem_shape, args.op_3, workspace_ptr + workspace_offset, stream, cuda_adapter); + workspace_offset += Op3::get_workspace_size(problem_shape, args.op_3); + workspace_offset = round_nearest(workspace_offset, MinWorkspaceAlignment); + if (status != Status::kSuccess) { + return status; + } + + return status; + } + + CUTLASS_HOST_DEVICE + Sm90VisitorImplBase() {} + + CUTLASS_HOST_DEVICE + Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage) + : ops({ + Op0(params.op_0, get<0>(shared_storage)), + Op1(params.op_1, get<1>(shared_storage)), + Op2(params.op_2, get<2>(shared_storage)), + Op3(params.op_3, get<3>(shared_storage)) + }) {} + + tuple ops; +}; + +} // 
namespace detail + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::fusion + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp new file mode 100644 index 0000000000000000000000000000000000000000..bd378419567b1680c400ec38746211a577a3c409 --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp @@ -0,0 +1,763 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Visitor tree Top-K + Softmax fusion operation for sm90 TMA warp-specialized epilogue +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/workspace.h" + +#include "cute/tensor.hpp" +#include "sm90_visitor_tma_warpspecialized.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::epilogue::fusion { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Top-K + Softmax reduction across columns +// Performs a reduction of top-K values across N, and finally performs a softmax on them, +// and sets values not in the top-K to 0. +// +// Assumptions: +// 1. CTA_N >= N (single tile across N, the mode which is reduced) +// 2. EPI_N >= N (single epilogue tile across N, because we can reduce and revisit one +// epilogue tile at a time.) +// 3. Top-K value is either 2 or 4. 
+// + +namespace detail { + +// Implementations for add to sorted list and merging sorted lists, +// with fast paths for lists of size 2 and 4 (Top-2 and Top-4). +// Generic implementations may result in greater register use and branching, +// and should be avoided. +// Fast paths for Top-2 and Top-4 are written in inline PTX directly. + +CUTLASS_DEVICE +Array top_2_reduce_scalar(Array a, float scalar) { + Array out; + asm volatile( + "{\n" + " .reg .f32 mx;\n" + " .reg .pred p;\n" + " max.f32 mx, %3, %4;\n" + " setp.gtu.f32 p, %2, %4;\n" + " selp.f32 %1, mx, %2, p;\n" + " selp.f32 %0, %2, %4, p;\n" + "}\n" : "=f"(out[0]), "=f"(out[1]) : "f"(a[0]), "f"(a[1]), "f"(scalar)); + return out; +} + +CUTLASS_DEVICE +Array top_2_reduce(Array a, Array b) { + Array out; + asm volatile( + "{\n" + " .reg .v2 .f32 mx;\n" + " .reg .pred p;\n" + " max.f32 mx.x, %3, %4;\n" // max(a1, b0) + " max.f32 mx.y, %2, %5;\n" // max(a0, b1) + " setp.gtu.f32 p, %2, %4;\n" // a0 > b0 + " selp.f32 %1, mx.x, mx.y, p;\n" // a0 > b0 ? max(a1, b0) : max(a0, b1) + " selp.f32 %0, %2, %4, p;\n" // a0 > b0 ? a0 : b0 + "}\n" : "=f"(out[0]), "=f"(out[1]) : + "f"(a[0]), "f"(a[1]), "f"(b[0]), "f"(b[1])); + return out; +} + +CUTLASS_DEVICE +Array top_4_reduce_scalar(Array a, float scalar) { + Array out; + asm volatile( + "{\n" + " .reg .f32 mx;\n" // max(a3, b) + " .reg .pred p0;\n" // a0 > b + " .reg .pred p1;\n" // a1 > b + " .reg .pred p2;\n" // a2 > b + " max.f32 mx, %7, %8;\n" // max(a3, b) + " setp.gtu.f32 p0, %4, %8;\n" // a0 > b + " setp.gtu.f32 p1, %5, %8;\n" // a1 > b + " setp.gtu.f32 p2, %6, %8;\n" // a2 > b + " selp.f32 %3, mx, %6, p2;\n" // a2 > b ? max(a3, b) : a2 + " selp.f32 %2, %6, %8, p2;\n" // a1 = a2 > b ? a2 : b + " selp.f32 %2, %2, %5, p1;\n" // a1 > b ? max(a2, b) : a1 == a1 > b ? a1 : old_a1 + " selp.f32 %1, %5, %8, p1;\n" // a0 = a1 > b ? a1 : b + " selp.f32 %1, %1, %4, p0;\n" // a0 > b ? max(a1, b) : a0 == a0 > b ? a0 : old_a0 + " selp.f32 %0, %4, %8, p0;\n" // a0 = a0 > b ? 
a0 : b + "}\n" : + "=f"(out[0]), "=f"(out[1]), "=f"(out[2]), "=f"(out[3]) : + "f"(a[0]), "f"(a[1]), "f"(a[2]), "f"(a[3]), "f"(scalar)); + return out; +} + +CUTLASS_DEVICE +Array top_4_reduce(Array a, Array b) { + Array out; + asm volatile( + "{\n" + " .reg .f32 mxa0b1;\n" // max(a0, b1) + " .reg .f32 mxa1b0;\n" // max(a1, b0) + + " .reg .f32 mxa2b0;\n" // max(a2, b0) + " .reg .f32 mxa1b1;\n" // max(a1, b1) + " .reg .f32 mxa0b2;\n" // max(a1, b1) + + " .reg .f32 mxa1b2;\n" // max(a1, b2) + " .reg .f32 mxa2b1;\n" // max(a2, b1) + " max.f32 mxa1b2, %5, %10;\n" + " max.f32 mxa2b1, %6, %9;\n" + + " .reg .f32 mxa3b0;\n" // max(a1, b2) + " .reg .f32 mxa0b3;\n" // max(a2, b1) + " max.f32 mxa3b0, %7, %8;\n" + " max.f32 mxa0b3, %4, %11;\n" + + " .reg .pred pa0b0;\n" // a0 > b0 + " .reg .pred pa1b0;\n" // a1 > b0 + " .reg .pred pa2b0;\n" // a2 > b0 + " .reg .pred pa0b1;\n" // a0 > b1 + " .reg .pred pa1b1;\n" // a1 > b1 + " .reg .pred pa0b2;\n" // a0 > b2 + " .reg .pred pb2a0;\n" // b1 > a0 + " .reg .pred pb1a0;\n" // b1 > a0 + + " setp.gtu.f32 pa0b0, %4, %8;\n" // a0 > b0 + " setp.gtu.f32 pa1b0, %5, %8;\n" // a1 > b0 + " setp.gtu.f32 pa2b0, %6, %8;\n" // a2 > b0 + " setp.gtu.f32 pa0b1, %4, %9;\n" // a0 > b1 + " setp.gtu.f32 pa1b1, %5, %9;\n" // a1 > b1 + " setp.gtu.f32 pa0b2, %4, %10;\n" // a0 > b2 + + " not.pred pb2a0, pa0b2;\n" + " not.pred pb1a0, pa0b1;\n" + + " selp.f32 mxa1b0, %5, %8, pa1b0;\n" // max(a1, b0) + " selp.f32 mxa0b1, %4, %9, pa0b1;\n" // max(a0, b1) + + " selp.f32 mxa1b1, %5, %9, pa1b1;\n" // max(a1, b1) + " selp.f32 mxa2b0, %6, %8, pa2b0;\n" // max(a2, b0) + " selp.f32 mxa0b2, %4, %10, pa0b2;\n" // max(a0, b2) + + // a0 + " selp.f32 %0, %4, %8, pa0b0;\n" // a0 = a0 > b0 ? a0 : b0 + + // a1 + " selp.f32 %1, mxa1b0, mxa0b1, pa0b0;\n" // a1 = a0 > b0 ? 
max(a1, b0) : max(a0, b1) + + // a2 + " mov.f32 %2, mxa1b1;\n" // a2 = max(a1, b1) ** most likely case + " selp.f32 %2, mxa2b0, %2, pa1b0;\n" // a0 > a1 > b0 + " selp.f32 %2, mxa0b2, %2, pb1a0;\n" // b0 > b1 > a0 + + // a3 + " mov.f32 %3, mxa1b2;\n" // a3 = max(a1, b2) ** one of the most likely cases + " selp.f32 %3, mxa2b1, %3, pa1b1;\n" // a3 = a1 > b1 ? max(a2, b1) ** second most likely case + " selp.f32 %3, mxa3b0, %3, pa2b0;\n" // a0 > a1 > a2 > b0 + " selp.f32 %3, mxa0b3, %3, pb2a0;\n" // b0 > b1 > b2 > a0 + "}\n" : + "=f"(out[0]), "=f"(out[1]), "=f"(out[2]), "=f"(out[3]) : + "f"(a[0]), "f"(a[1]), "f"(a[2]), "f"(a[3]), + "f"(b[0]), "f"(b[1]), "f"(b[2]), "f"(b[3])); + return out; +} + +// Assumption: array elements are sorted in descending order +// (a[0] is the largest element in a[].) +template +CUTLASS_DEVICE +void add_element_to_desc_sorted_array(cutlass::Array& a, Element b) { + if constexpr (N == 2 && is_same_v) { + a = top_2_reduce_scalar(a, b); + } + else if constexpr (N == 4 && is_same_v) { + a = top_4_reduce_scalar(a, b); + } + else { + // slower generic path with branching, slower, and can cause register spill + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < N; ++k) { + if (a[k] < b) { + // Shift down + CUTLASS_PRAGMA_UNROLL + for (int l = N - 1; l > k; --l) { + a[l] = a[l-1]; + } + a[k] = b; + break; + } + } + } +} + +// Assumption: array elements are sorted in descending order +// (a[0] and b[0] are the largest elements in a[] and b[].) 
+template +CUTLASS_DEVICE +void merge_desc_sorted_arrays(cutlass::Array& a, const cutlass::Array& b) { + if constexpr (N == 2 && is_same_v) { + a = top_2_reduce(a, b); + } + else if constexpr (N == 4 && is_same_v) { + a = top_4_reduce(a, b); + } + else { + // slower generic path with branching, slower, and can cause register spill + int j = 0; + CUTLASS_PRAGMA_UNROLL + for (int k = 0; k < N; ++k) { + if (a[k] < b[j]) { + // Shift down + CUTLASS_PRAGMA_UNROLL + for (int l = N - 1; l > k; --l) { + a[l] = a[l-1]; + } + a[k] = b[j]; + ++j; + } + } + } +} + +// Assumption: array elements are sorted in descending order +// (a[0] is the largest element in a[].) +template +CUTLASS_DEVICE +Element topk_logsumexp(cutlass::Array a) { + // Do one less `exp`, because we know what its result will be. + // Assume x is a set of `x_i`s, and `x_m` is the maximum of that set. + // logsumexp(x) = log(sum(x_i)) = m + log(sum(x_i - m)) = m + log(1 + sum_{i != m}(x_i - x_m)) + // Compute m + log(1 + sum_{i != m}(x_i - x_m)) + Element sum = Element(1.0); + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < N; ++i) { + sum += fast_exp(a[i] - a[0]); + } + return a[0] + fast_log(sum); +} + +CUTLASS_DEVICE +float fast_masked_softmax(float value, float minimum, float logsumexp) { + float new_value; + asm volatile( + "{\n" + " .reg .pred p0;\n" + // value >= minimum + " setp.geu.f32 p0, %1, %2;\n" + + " .reg .f32 x_lse;\n" + " .reg .f32 %%f<11>;\n" + " .reg .b32 %%r<3>;\n" + + // x_lse = value - minimum + " sub.rn.f32 x_lse, %1, %3;\n" + + // exp(x_lse) + // The following is derived from a ptx dump of expf. + // exp requires a base conversion from exp2. 
+ " fma.rn.f32 %%f1, x_lse, 0f3BBB989D, 0f3F000000;\n" + " cvt.sat.f32.f32 %%f2, %%f1;\n" + " fma.rm.f32 %%f3, %%f2, 0f437C0000, 0f4B400001;\n" + " add.f32 %%f4, %%f3, 0fCB40007F;\n" + " neg.f32 %%f5, %%f4;\n" + " fma.rn.f32 %%f6, x_lse, 0f3FB8AA3B, %%f5;\n" + " fma.rn.f32 %%f7, x_lse, 0f32A57060, %%f6;\n" + " mov.b32 %%r1, %%f3;\n" + " shl.b32 %%r2, %%r1, 23;\n" + " mov.b32 %%f8, %%r2;\n" + " ex2.approx.ftz.f32 %%f9, %%f7;\n" + " mul.f32 %%f10, %%f9, %%f8;\n" + + // Mask or softmax + " selp.f32 %0, %%f10, 0f00000000, p0;\n" + "}\n" : "=f"(new_value) : "f"(value), "f"(minimum), "f"(logsumexp)); + return new_value; +} + +template +CUTLASS_DEVICE +Element masked_softmax(Element value, Element minimum, Element logsumexp) { + if constexpr (is_same_v) { + // Inline PTX implementation + // Significantly reduces register requirements + return fast_masked_softmax(value, minimum, logsumexp); + } + else { + return value < minimum ? Element(0.0) : fast_exp(value - logsumexp); + } +} + +} // namespace detail + +template < + int TopK, + int FragmentSize, + class CtaTileShapeMNK, + class EpilogueTile, + class ElementOutput, + class ElementCompute, + FloatRoundStyle RoundStyle, + int Alignment = 128 / sizeof_bits_v, + bool UseButterflyReduce = true +> +struct Sm90TopKSoftmaxColReduction { +private: + static_assert(is_same_v, "Fused Top-K + Softmax reduction requires FP32 accumulation."); + static_assert(TopK == 2 || TopK == 4, + "Fused Top-K + Softmax reduction only allows K=2 and K=4, because those cases have been performance-optimized. Other values of K can be enabled by removing this assertion, but they may come with serious performance implications." + ); + static_assert(Alignment * sizeof_bits_v % 128 == 0, "sub-16B alignment not supported yet"); + + // Reduction tensors + // We have two tensors for this EVT node: a reduction tensor and a tensor holding + // final reduction values (tCrSoftmax). 
The reason for this is that Top-K and Softmax + // require different reductions, but those luckily overlap. Top-K obviously needs at least + // two values (K >= 2), and softmax needs one value: logsumexp. Logsumexp is simply the log + // of sum of exponents over the set, and is equivalent to m + sum(exp(x_i - m)), where m is the + // maximum of all x_i elements. Since safe softmax for any element x_i is computed as + // softmax(x_i) = exp(x_i - m) / sum_j(exp(x_j - max)) + // we can track logsumexp instead of tracking two variables (sum of exps and the max). + // In addition, subtracting logsumexp from any element and taking its exp is equivalent to + // computing its softmax. + // + // The overlap between softmax and top-K is that we don't need to reduce logsumexp along the + // way at all, because any element not in the top-K is going to be masked out and set to 0. + // Therefore, we only reduce the top-K elements, and when done, compute their logsumexp and + // keep it, and the smallest element in the top-K for masking out non-top-K elements. + // + // This means that our final reduction result will always be 2 elements, regardless of the value + // of K: minimum of top-K, and logsumexp. + // + // For each reduction tensor, we define a new struct for readability. + + struct ReductionResult { + ElementCompute min_; + ElementCompute logsumexp_; + + CUTLASS_DEVICE + ReductionResult() { } + + CUTLASS_DEVICE + ReductionResult(ElementCompute min, ElementCompute logsumexp): + logsumexp_(logsumexp), min_(min) { } + + // Warp shuffle broadcast + CUTLASS_DEVICE + void shuffle_up_sync(uint32_t delta, int lane_id) { + static_assert(sizeof(ReductionResult) == sizeof(uint64_t)); + uint64_t r = reinterpret_cast(*this); + r = __shfl_up_sync(0xFFFFFFFF, r, delta); + *this = (lane_id - static_cast(delta) >= 0) ? 
reinterpret_cast(r) : *this; + } + }; + + struct TopKResult { + Array top_k_; + + CUTLASS_DEVICE + TopKResult() { + top_k_.fill(-cutlass::platform::numeric_limits::infinity()); + } + + // This is where we do the "final" reduction, where we compute + // the logsumexp for softmax, keep the smallest value in top-K, + // and discard the rest. + CUTLASS_DEVICE + ReductionResult reduce_final() const { + return ReductionResult(top_k_[TopK - 1], topk_logsumexp(top_k_)); + } + + // Butterfly reduction + CUTLASS_DEVICE + void shuffle_xor_sync(int laneMask) { + if constexpr (TopK == 2) { + static_assert(sizeof(TopKResult) == sizeof(uint64_t)); + uint64_t top_k = reinterpret_cast(*this); + top_k = __shfl_xor_sync(0xFFFFFFFF, top_k, laneMask); + auto synced_v = reinterpret_cast(top_k); + detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_); + } + else if constexpr (TopK == 4) { + static_assert(sizeof(TopKResult) == 2 * sizeof(uint64_t)); + uint64_t* top_k_ptr = reinterpret_cast(this); + uint64_t top_k_arr[2]; + top_k_arr[0] = top_k_ptr[0]; + top_k_arr[1] = top_k_ptr[1]; + top_k_arr[0] = __shfl_xor_sync(0xFFFFFFFF, top_k_arr[0], laneMask); + top_k_arr[1] = __shfl_xor_sync(0xFFFFFFFF, top_k_arr[1], laneMask); + auto synced_v = reinterpret_cast(top_k_arr); + detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_); + } + else { + TopKResult synced_v; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < TopK; ++i) { + synced_v.top_k_[i] = __shfl_xor_sync(0xFFFFFFFF, top_k_[i], laneMask); + } + detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_); + } + } + + // Warp shuffle reduction + CUTLASS_DEVICE + void shuffle_down_sync(uint32_t delta) { + if constexpr (TopK == 2) { + static_assert(sizeof(TopKResult) == sizeof(uint64_t)); + uint64_t top_k = reinterpret_cast(*this); + top_k = __shfl_down_sync(0xFFFFFFFF, top_k, delta); + auto synced_v = reinterpret_cast(top_k); + detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_); + } + else if constexpr (TopK == 4) { + 
static_assert(sizeof(TopKResult) == 2 * sizeof(uint64_t)); + uint64_t* top_k_ptr = reinterpret_cast(this); + uint64_t top_k_arr[2]; + top_k_arr[0] = top_k_ptr[0]; + top_k_arr[1] = top_k_ptr[1]; + top_k_arr[0] = __shfl_down_sync(0xFFFFFFFF, top_k_arr[0], delta); + top_k_arr[1] = __shfl_down_sync(0xFFFFFFFF, top_k_arr[1], delta); + auto synced_v = reinterpret_cast(top_k_arr); + detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_); + } + else { + TopKResult synced_v; + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < TopK; ++i) { + synced_v.top_k_[i] = __shfl_down_sync(0xFFFFFFFF, top_k_[i], delta); + } + detail::merge_desc_sorted_arrays(top_k_, synced_v.top_k_); + } + } + }; + +public: + struct SharedStorage { }; + + struct Arguments { }; + + struct Params { }; + + template + static constexpr Params + to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) { + return {}; + } + + template + static bool + can_implement(ProblemShape const& problem_shape, Arguments const& args) { + auto [M, N, K, L] = problem_shape; + auto [tile_M, tile_N, tile_K] = CtaTileShapeMNK{}; + // Cross CTA reduction is not possible because there is no guarantee that all CTAs run + // concurrently. + // Cross epilogue tile reduction is possible, but re-visiting and applying reduction + // to accumulators is only possible for the current epilogue tile. 
+ auto [epi_M, epi_N] = EpilogueTile{}; + return N <= tile_N && N <= epi_N && N >= TopK; + } + + template + static size_t + get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) { + return 0; + } + + template + static cutlass::Status + initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream, + CudaHostAdapter* cuda_adapter = nullptr) { + return Status::kSuccess; + } + + CUTLASS_DEVICE bool + is_producer_load_needed() const { + return false; + } + + CUTLASS_DEVICE bool + is_C_load_needed() const { + return false; + } + + CUTLASS_HOST_DEVICE + Sm90TopKSoftmaxColReduction() { } + + CUTLASS_HOST_DEVICE + Sm90TopKSoftmaxColReduction(Params const& params, SharedStorage const& shared_storage) + : params(params) { } + + Params params; + + template + CUTLASS_DEVICE auto + get_producer_load_callbacks(ProducerLoadArgs const& args) { + return EmptyProducerLoadCallbacks{}; + } + + template + struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { + CUTLASS_DEVICE + ConsumerStoreCallbacks(ArgsTuple&& args_tuple, Params const& params) + : args_tuple(cute::forward(args_tuple)), + params(params) {} + + ArgsTuple args_tuple; + Params const& params; + + template + CUTLASS_DEVICE auto + visit(Array const& frg_acc, int epi_v, int epi_m, int epi_n, + Array const& frg_input) { + + auto& [tCrTopK, tCrSoftmax, tCcCol, cCol, + lane_layout_MN, lane_mn, + residue_cCol, residue_tCcCol] = args_tuple; + Tensor tCcCol_mn = tCcCol(_,_,_,epi_m,epi_n); + + using ConvertInput = NumericArrayConverter; + ConvertInput convert_input{}; + + Array frg_I = convert_input(frg_input); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + auto thread_crd = tCcCol_mn(epi_v * FragmentSize + i); + if (elem_less(thread_crd, residue_tCcCol)) { + TopKResult& tCrCol_vmn = tCrTopK(epi_v * FragmentSize + i); + detail::add_element_to_desc_sorted_array(tCrCol_vmn.top_k_, frg_I[i]); + } + } + + return frg_input; + } + + 
template + CUTLASS_DEVICE void + reduce(STensor&& smem_buffer, SyncFn const& sync_fn, int epi_m, int epi_n, bool is_last_iteration, VTensor visit_results) { + + auto& [tCrTopK, tCrSoftmax, tCcCol, cCol, + lane_layout_MN, lane_mn, + residue_cCol, residue_tCcCol] = args_tuple; + + // fully OOB CTA in partially OOB cluster + if (not elem_less(cCol(_0{},_0{}), residue_cCol)) { + return; + } + Tensor tCcCol_mn = tCcCol(_,_,_,epi_m,epi_n); + + // `tCrTopK` and `tCrSoftmax` have 0-strides along modes that correspond to N, + // in order to reduce along modes in the `R2S` sublayout that correspond to N. + // This means we should modify and warp-reduce them according to their co-domain instead of + // their domain. Therefore we keep a filtered view of both and use them as necessary. + auto tCrTopK_f = filter(tCrTopK); + auto tCrSoftmax_f = filter(tCrSoftmax); + + // The pattern here is: reduce Top-K first, then compute logsumexp, keep it and the + // last element of Top-K, use the latter to mask the visited results, and the former + // to apply softmax. + // + // This gives us two options: reduce the Top-K with warp shuffles, have the reduced + // lanes compute logsumexp and pair it with the last Top-K element, and broadcast + // the result back using warp shuffles. + // + // Alternatively, we can do a butterfly reduction over Top-K, and have all lanes + // compute their own logsumexp and skip the broadcast. + if constexpr (UseButterflyReduce) { + // + // 1. Butterfly reduction + // + CUTLASS_PRAGMA_UNROLL + for (int j = 1; j < size<1>(lane_layout_MN); j *= 2) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tCrTopK_f); ++i) { + tCrTopK_f(i).shuffle_xor_sync(j); + } + } + + // + // 2. Strip down reduced value and compute sum of exps + // + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tCrSoftmax_f); ++i) { + tCrSoftmax_f(i) = tCrTopK_f(i).reduce_final(); + } + } + else { + // + // 1. 
Warp shuffle reduction + // + CUTLASS_PRAGMA_UNROLL + for (int reduction_cols = size<1>(lane_layout_MN) / 2; reduction_cols > 0; reduction_cols /= 2) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tCrTopK_f); ++i) { + tCrTopK_f(i).shuffle_down_sync(lane_layout_MN(_0{},reduction_cols)); + } + } + + // + // 2. Strip down reduced value and compute sum of exps + // + bool is_reduced_lane = get<1>(lane_mn) == 0; + if (is_reduced_lane) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tCrSoftmax_f); ++i) { + tCrSoftmax_f(i) = tCrTopK_f(i).reduce_final(); + } + } + + // + // 3. Broadcast reduced values to all participants + // + CUTLASS_PRAGMA_UNROLL + for (int broadcast_cols = 1; broadcast_cols <= size<1>(lane_layout_MN) / 2; broadcast_cols *= 2) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(tCrSoftmax_f); ++i) { + tCrSoftmax_f(i).shuffle_up_sync(lane_layout_MN(_0{},broadcast_cols), get<1>(lane_mn)); + } + } + } + + // + // 4. Re-visit and apply top-K and softmax + // + CUTLASS_PRAGMA_UNROLL + for (int epi_v = 0; epi_v < size(visit_results); ++epi_v) { + auto& visit_frag = visit_results(epi_v); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < FragmentSize; ++i) { + visit_frag[i] = detail::masked_softmax( + visit_frag[i], + tCrSoftmax(epi_v * FragmentSize + i).min_, + tCrSoftmax(epi_v * FragmentSize + i).logsumexp_ + ); + } + } + + } + + CUTLASS_DEVICE void + end_loop(int epi_m, int epi_n) { + auto& [tCrTopK, tCrSoftmax, tCcCol, cCol, + lane_layout_MN, lane_mn, + residue_cCol, residue_tCcCol] = args_tuple; + + // Reset reduced top-K values for next tile + // This must be done because we only assume a single epilogue tile across N, + // but not M. + fill(tCrTopK, TopKResult()); + } + + CUTLASS_DEVICE void + end() { } + + }; + + template < + bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy + class... 
Args + > + CUTLASS_DEVICE auto + get_consumer_store_callbacks(ConsumerStoreArgs const& args) { + Layout ref_layout_MN = [&] () { + auto mn_shape = shape(typename decltype(args.tiled_copy)::Tiler_MN{}); + if constexpr (ReferenceSrc) { return right_inverse(args.tiled_copy.get_layoutS_TV()).with_shape(mn_shape); } + else { return right_inverse(args.tiled_copy.get_layoutD_TV()).with_shape(mn_shape); } + }(); // tile_mn -> tv_idx + + // Get the MN layout + coord of lanes to determine shuffle reduction iterations + using _W = Int; + Layout tv2lane = Layout,_W,_1>,Stride<_1,_0,_0>>{}; // tv_idx -> lane_idx + Layout ref2lane = composition(tv2lane, ref_layout_MN); // tile_mn -> lane_idx + Layout lane_layout_MN = make_layout(filter(get<0>(ref2lane)), filter(get<1>(ref2lane))); // lane_mn -> lane_idx + Layout inv_lane_layout_MN = right_inverse(lane_layout_MN); // lane_idx -> lane_mn + int lane_idx = canonical_lane_idx(); + auto lane_mn = idx2crd(inv_lane_layout_MN(lane_idx), shape(lane_layout_MN)); + + // Get the MN layout + coord of warps to determine smem reduction iterations + Layout tv2warp = Layout,_W,_1>,Stride<_0,_1,_0>>{}; // tv_idx -> warp_idx + Layout ref2warp = composition(tv2warp, ref_layout_MN); // tile_mn -> warp_idx + Layout warp_layout_MN = make_layout(filter(get<0>(ref2warp)), filter(get<1>(ref2warp))); // warp_mn -> warp_idx + + // Make sure there's only one warp across N so we can use warp shuffle intrinsics for reduction. + static_assert(decltype(size<1>(warp_layout_MN))::value <= 1); + + // Reduction layout + // We're assuming all elements in a row (over which we're performing the reduction) are + // visited in the same corresponding epilogue tile, and this is what allows us to apply the + // top-K + softmax operation within `reduce()`, by re-visiting the accumulated results. + // + // This presents a challenge, because the layout of the accumulated results is typically in + // in the register to shared memory shape, or: (R2S,R2S_M,R2S_N). 
+ // This means that we still need to reduce this tensor along N. + // + // The solution is simple: we need to flatten the layout, identify modes that correspond to + // N and set their strides to 0, in order to map fragment indices corresponding to the same + // row back to the same element in the tensor. + // + // This requires some extra layout manipulation, which is as follows. + + // Create new accumulator layout with column broadcast + auto [M, N, K] = args.tile_shape_mnk; + auto thr_mma = args.tiled_mma.get_thread_slice(args.thread_idx); + auto gColReduce = make_tensor( + make_layout(make_shape(M, N), make_stride(_1{}, 0_c))); // (M,N) + auto tCrColReduce = make_tensor_like( // (FrgV, MMA_M, MMA_N) + thr_mma.partition_C(gColReduce).layout()); + + // Tile the new accumulator tensor according to R2S + ThrCopy thread_r2s = args.tiled_copy.get_slice(args.thread_idx); + Tensor tRS_rSoftmax = thread_r2s.retile_S(tCrColReduce); // ((R2S,R2S_V),MMA_M,MMA_N) + auto tCrC_layout = args.tCrC.layout(); // (R2S,R2S_M,R2S_N) + + // Compose the new accumulator R2S layout with the expected tCrC layout to get final + // reduction tensor layout. 
+ auto tCrSoftmax_layout = take<0, 3>(tRS_rSoftmax.layout()).compose(tCrC_layout); // (R2S,R2S_V) o (R2S,R2S_M,R2S_N) + + Tensor tCrTopK = make_tensor(tCrSoftmax_layout); // (R2S,R2S_M,R2S_N) + Tensor tCrSoftmax = make_tensor(tCrSoftmax_layout); // (R2S,R2S_M,R2S_N) + fill(tCrTopK, TopKResult()); + + auto args_tuple = make_tuple( + cute::move(tCrTopK), cute::move(tCrSoftmax), args.tCcD, args.cD, + lane_layout_MN, lane_mn, + args.residue_cD, args.residue_tCcD); + return ConsumerStoreCallbacks(std::move(args_tuple), params); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::epilogue::fusion + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/activation.h b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/activation.h new file mode 100644 index 0000000000000000000000000000000000000000..8412b5037b3aacbca4d28b80b99839acb368d5df --- /dev/null +++ b/build/torch29-cxx11-cu128-aarch64-linux/include/third-party/cutlass/include/cutlass/epilogue/thread/activation.h @@ -0,0 +1,914 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief This extends the contents of cutlass/functional.h with frequently used activation functions. 
+ +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/constants.h" +#include "cutlass/complex.h" +#include "cutlass/array.h" +#include "cutlass/half.h" +#include "cutlass/functional.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace thread { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// If kIsHeavy is a member, use it. Otherwise, assume that it's false. +template +struct kIsHeavy_member_or_false { + static constexpr bool value = false; +}; +template +struct kIsHeavy_member_or_false::type> { + static constexpr bool value = Op::kIsHeavy; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// Identity operator +template +struct Identity { + static const bool kIsHeavy = false; + + CUTLASS_HOST_DEVICE + T operator()(T value) const { + return value; + } +}; + +template +struct Identity > { + CUTLASS_HOST_DEVICE + Array operator()(Array value) const { + return value; + } +}; + +/// Scale operator +template +struct Scale { + struct Arguments { + using scale_type = T; + T scale = T(1); + }; + + CUTLASS_HOST_DEVICE + T operator()(T value, T scale) const { + multiplies mul; + return mul(scale, value); + } + + CUTLASS_HOST_DEVICE + T operator()(T value, Arguments args = Arguments()) const { + return this->operator()(value, args.scale); + } +}; + +template +struct Scale> { + using Arguments = typename Scale::Arguments; + + CUTLASS_HOST_DEVICE + Array operator()(Array values, T scale) const { + multiplies> mul; + return mul(scale, values); + } + + CUTLASS_HOST_DEVICE + Array operator()(Array values, Arguments args = Arguments()) const { + return this->operator()(values, args.scale); + } +}; + +/// Specialization to compose other activations with a defined unary 
operator +/// e.g. Scale> +template